htmlparser解析下来的文本进行储存与建立索引问题
运行报错,
package com.luceneheritrixbook.core;
import java.io.*;
import java.util.*;
import racebean.UserBean;
import racecl.UsersCl;
//import racebean.UserBean;
//import com.luceneheritrixbook.database.ProductJDBC;
import com.luceneheritrixbook.extractor.Extractor;
import com.luceneheritrixbook.index.Raceindex;
//import com.luceneheritrixbook.index.ProductIndexer;
import com.luceneheritrixbook.searchengine.config.PropertyConfiguration;
public class RaceTextFileProcessor {
/**
* directories for those stored product info txt files
*/
private String[] directories;
private static final String dbUrl = PropertyConfiguration.getDBUrl();
private static final String dbUsr = PropertyConfiguration.getDBUsr();
private static final String dbPwd = PropertyConfiguration.getDBPwd();
private UsersCl usersCl = null;
private Raceindex indexer = null;
public final static int SUMMARY_LENGTH = 80;
private static final String indexPath = PropertyConfiguration.getIndexStorePath();
/**
* Default constructor
*
*/
public RaceTextFileProcessor() {
initialize();
}
public void initialize() {
try {
usersCl = new UsersCl(dbUrl,dbUsr,dbPwd);
indexer = new Raceindex(indexPath);
}catch(Exception e){
e.printStackTrace();
}
}
public void setDirectories(String[] directories) {
this.directories = directories;
}
protected void process() throws Exception {
if (usersCl == null) {
throw new Exception("Database connection failed, pls retry!!");
}
if (directories == null || directories.length == 0) {
return;
}
try {
for (int i = 0; i < directories.length; i++) {
File f = new File(directories[i]);
traverse(f);
}
closeDB();
Thread.sleep(10);
closeIndex();
} catch (Exception e) {
e.printStackTrace();
}
}
private void traverse(File file) throws Exception {
String [] files = file.list();
for (int i = 0; i < files.length; i++) {
File productfile = new File(file, files[i]);
String fname = productfile.getName();
System.out.println(productfile);
BufferedReader reader = new BufferedReader(new FileReader(productfile));
String url = reader.readLine();
String title = reader.readLine();
String type = reader.readLine();
String imageURI = "";
String updatetime = fname.substring(fname.lastIndexOf("-")+1, fname.lastIndexOf("."));
StringBuffer content = new StringBuffer();
String line = reader.readLine();
while (line != null && !line.equals(Extractor.SEPARATOR)){
content.append(line).append("\r\n");
line = reader.readLine();
}
imageURI = reader.readLine();
// make the Product object
UserBean p = new UserBean();
//p.setCategory("产品");
p.setTitle(title);
//p.setType(type);
p.setImageurl(imageURI);
p.setUrl(url);
String contentstr = content.toString();
p.setContent(contentstr);
if (contentstr.length() > SUMMARY_LENGTH) {
p.setSummary(contentstr.substring(0,SUMMARY_LENGTH-1));
}
else
p.setSummary(contentstr);
p.setUpdatetime(updatetime);
// save to db first, and get the returned ID
int nextid = insert2DB(p);
// now we are trying to build Lucene document
buildIndex(p, nextid);
Thread.sleep(10);
}
optimizeIndex();
}
protected int insert2DB(UserBean p) throws Exception {
return usersCl.addProduct(p);
}
protected void buildIndex(UserBean p, int nextid) throws Exception {
indexer.addProduct(p, nextid);
}
private void optimizeIndex() throws Exception {
indexer.optimizeIndex();
}
private void closeIndex() throws Exception {
indexer.close();
}
private void closeDB() {
usersCl.close();
}
public String getDbPwd() {
return dbPwd;
}
public String getDbUrl() {
return dbUrl;
}
public String getDbUsr() {
return dbUsr;
}
public String getIndexPath() {
return indexPath;
}
}
貌似是遍历的问题,但是不知道怎么修改 索引 lucene --------------------编程问答-------------------- fname.lastIndexOf("-")+1, fname.lastIndexOf(".")看看这两个值是不是有个-1的 --------------------编程问答-------------------- 说个题外话,推荐用jSoup……比parser更简单易用
补充:Java , Java相关