至顶网›软件频道 ›如何使用Lucene对html文件进行索引

如何使用Lucene对html文件进行索引

扫一扫
分享文章到微信
扫一扫
关注官方公众号
至顶头条

　　我修改了lucene的demo包的IndexHTML类，使其可以被其他Java类调用。

作者：中国IT实验室　来源：中国IT实验室　2007年9月24日

　　我修改了lucene的demo包的IndexHTML类，使其可以被其他Java类调用。
　　
　　IndexHTML类
　　
　　import org.apache.lucene.analysis.standard.StandardAnalyzer;
　　
　　import org.apache.lucene.document.Document;
　　
　　import org.apache.lucene.index.IndexReader;
　　
　　import org.apache.lucene.index.IndexWriter;
　　
　　import org.apache.lucene.index.Term;
　　
　　import org.apache.lucene.index.TermEnum;
　　
　　import java.io.File;import java.util.Date;
　　
　　import java.util.Arrays;
　　
　　//还需调用demo的其他类。
　　
　　import org.apache.lucene.demo;
　　
　　/**
　　
　　* Create html file index for searching
　　
　　* @author tyrone
　　
　　*
　　
　　*/public class IndexHTML { private String DocsPath=null;
　　
　　/**
　　
　　* the path for index file;
　　
　　*/ private String IndexFilePath=null;
　　
　　/**
　　
　　* true during deletion pass
　　
　　*/　 private boolean deleting = false;
　　
　　/**
　　
　　* existing index
　　
　　*/　 private IndexReader reader;
　　
　　/**
　　
　　* new index being built
　　
　　*/　 private IndexWriter writer;
　　
　　/**
　　
　　* document id iterator
　　
　　*/　 private TermEnum uidIter;
　　
　　private void indexDocs(File file)throws Exception {
　　
　　if (file.isDirectory())
　　
　　{
　　
　　// if a directory　 String[] files = file.list();
　　
　　// list its files　 Arrays.sort(files);
　　
　　// sort the files　 for (int i = 0; i < files.length;
　　
　　i++)　 // recursively index them　　this.indexDocs(new File(file, files[i]));
　　
　　} else if (file.getPath().endsWith(".html") || // index .html files　　file.getPath().endsWith(".htm") || // index .htm files　　file.getPath().endsWith(".txt")) { // index .txt files　　　if (this.uidIter != null) {　　String uid = HTMLDocument.uid(file);
　　
　　// construct uid for doc
　　
　　while (uidIter.term() != null && uidIter.term().field() == "uid" &&
　　
　　uidIter.term().text().compareTo(uid) <0) {
　　
　　if (deleting) {
　　
　　// delete stale docs
　　
　　System.out.println("deleting " +
　　
　　HTMLDocument.uid2url(uidIter.term().text()));
　　
　　reader.delete(uidIter.term());
　　
　　}
　　
　　uidIter.next();
　　
　　}
　　
　　if (uidIter.term() != null && uidIter.term().field() == "uid" &&
　　
　　uidIter.term().text().compareTo(uid) == 0) {
　　
　　uidIter.next();
　　
　　// keep matching docs
　　
　　} else if (!deleting) {
　　
　　// add new docs
　　
　　Document doc = HTMLDocument.Document(file);
　　
　　System.out.println("adding " + doc.get("url"));
　　
　　writer.addDocument(doc);
　　
　　}
　　
　　} else { // creating a new index
　　
　　Document doc = HTMLDocument.Document(file);
　　
　　System.out.println("adding " + doc.get("url"));
　　
　　writer.addDocument(doc);
　　
　　// add docs unconditionally
　　
　　}
　　
　　}　return;
　　
　　}
　　
　　/**
　　
　　* Walk directory hierarchy in uid order, while keeping uid iterator from
　　
　　* existing index in sync.　Mismatches indicate one of:
　　
　　* (a) old documents to be deleted;
　　
　　* (b) unchanged documents, to be left alone;
　　
　　* or (c) new documents, to be indexed.
　　
　　*/　 private void indexDocs(File file, String index, boolean create)
　　
　　throws Exception {
　　
　　if (!create) {
　　
　　// incrementally update
　　
　　reader = IndexReader.open(index);
　　
　　// open existing index
　　
　　uidIter = reader.terms(new Term("uid", ""));
　　
　　// init uid iterator
　　
　　this.indexDocs(file);
　　
　　if (deleting) {
　　
　　// delete rest of stale docs
　　
　　while (uidIter.term() != null && uidIter.term().field() == "uid") {
　　
　　System.out.println("deleting " +
　　
　　HTMLDocument.uid2url(uidIter.term().text()));
　　
　　reader.delete(uidIter.term());
　　
　　uidIter.next();
　　
　　}
　　
　　deleting = false;
　　
　　}
　　
　　uidIter.close();
　　
　　// close uid iterator
　　
　　reader.close();
　　
　　// close existing index
　　
　　} else
　　
　　// don't have exisiting
　　
　　this.indexDocs(file);
　　
　　}
　　
　　/**
　　
　　* if create=true, create a new index, else refresh old index.
　　
　　* @param create
　　
　　*/ public void run(boolean create)
　　
　　{
　　
　　try {
　　
　　String index = "index";
　　
　　File root = null;
　　
　　if (this.IndexFilePath!=null)
　　
　　{
　　
　　// index file path
　　
　　index = this.IndexFilePath;
　　
　　}
　　
　　if (this.DocsPath==null){
　　
　　System.out.println("root directory is not set");
　　
　　return;
　　
　　}
　　
　　root = new File(this.DocsPath);
　　
　　Date start = new Date();
　　
　　/**
　　
　　* not create then maintenance
　　
　　*/
　　
　　if (!create) {
　　
　　// delete stale docs
　　
　　this.deleting = true;
　　
　　this.indexDocs(root, index, create);
　　
　　}
　　
　　writer = new IndexWriter(index, new StandardAnalyzer(), create);
　　
　　writer.maxFieldLength = 1000000;
　　
　　this.indexDocs(root, index, create);
　　
　　// add new docs
　　
　　System.out.println("Optimizing index...");
　　
　　writer.optimize();
　　
　　writer.close();
　　
　　Date end = new Date();
　　
　　System.out.print(end.getTime() - start.getTime());
　　
　　System.out.println(" total milliseconds");
　　
　　} catch (Exception e) {
　　
　　System.out.println(" caught a " + e.getClass() +
　　
　　"\n with message: " + e.getMessage());
　　
　　}
　　
　　return;
　　
　　}
　　
　　/**
　　
　　* @return Returns the IndexFilePath.
　　
　　*/ public String getIndexFilePath() {　return IndexFilePath;
　　
　　}
　　
　　/**
　　
　　* @param IndexFilePath The IndexFilePath to set.
　　
　　*/ public void setIndexFilePath(String property1) {　this.IndexFilePath = property1;
　　
　　}
　　
　　/**
　　
　　* @return Returns the DocsPath.
　　
　　*/ public String getDocsPath() {　return DocsPath;
　　
　　}
　　
　　/**
　　
　　* @param DocsPath The DocsPath to set.
　　
　　*/ public void setDocsPath(String property1) {　this.DocsPath = property1;
　　
　　}
　　
　　/**
　　
　　* test
　　
　　* @param args
　　
　　*/ public static void main(String[] args){　IndexHTML ih=new IndexHTML();
　　
　　ih.setDocsPath("D:\\MyProject\\colimas\\clms-doc2\\html");
　　
　　ih.setIndexFilePath("D:\\MyProject\\colimas\\index");　ih.run(true); }}
　　
　　运行后生成3个文件：_3i8.cfs、deletable、segments。
　　
　　搜索文件类：
　　
　　/*
　　
　　* Created on 2005/07/28
　　
　　*
　　
　　* TODO To change the template for this generated file go to
　　
　　* Window - Preferences - Java - Code Style - Code Templates
　　
　　*/package com.nova.colimas.search.query;
　　
　　/** * @author tyrone * * TODO To change the template for this generated type comment go to
　　
　　* Window - Preferences - Java - Code Style - Code Templates
　　
　　*/public class HitsHTMLDoc {　private String Title;
　　
　　priva

查看本文来源

VIP专区

VIP用户

普通用户

邮件订阅

如果您非常迫切地想了解IT领域最新产品与技术信息，那么订阅至顶网技术邮件将是您的最佳途径之一。

重磅专题

往期文章

业界热点:

数字化转型 东数西算 自动驾驶 智能制造 云计算 元宇宙 虚拟人 物联网 算力网络 数字孪生 人工智能 区块链 开源 大数据