HighFreqTerms


panglaohu

Jan 11, 2010, 10:17:22 PM
to Lucene 探源
package Sample;

import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.Version;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;

import java.io.File;
import java.util.Hashtable;

public class HighFreqTermViewer {
    public static int defaultNumTerms = 6;
    // Local index directory; adjust this path to your own environment.
    static final String temDir = "C:\\Users\\Administrator\\workspaceNew\\lucene3.0\\index";
    static File temDirIndexFile = new File(temDir);

    public void buildIndex() throws Exception {
        // Build a small test index with three documents under temDir.
        IndexWriter writer = new IndexWriter(FSDirectory.open(temDirIndexFile),
                new StandardAnalyzer(Version.LUCENE_CURRENT), true,
                IndexWriter.MaxFieldLength.LIMITED);
        writer.setUseCompoundFile(false);

        Document docOne = new Document();
        docOne.add(new Field("Title", "Hello lucene", Field.Store.YES,
                Field.Index.ANALYZED));
        docOne.add(new Field("contents", "I like Lucene. Lucene in Action, "
                + "Second Edition, completely revises and updates the best-selling "
                + "first edition and remains the authoritative book on Lucene. "
                + "This book shows you how to index your documents, including types "
                + "such as MS Word, PDF, HTML, and XML. It introduces you to searching, "
                + "sorting, and filtering, and covers the numerous changes to Lucene "
                + "since the first edition. All source code has been updated to "
                + "current Lucene 2.3 APIs.", Field.Store.NO, Field.Index.ANALYZED));
        writer.addDocument(docOne);

        Document docTwo = new Document();
        docTwo.add(new Field("Title", "Lucene development", Field.Store.YES,
                Field.Index.ANALYZED));
        docTwo.add(new Field("contents", "lucene is very hard. Apache Lucene "
                + "is a high-performance, full-featured text search engine library "
                + "written entirely in Java.", Field.Store.NO, Field.Index.ANALYZED));
        writer.addDocument(docTwo);

        Document docThree = new Document();
        docThree.add(new Field("Title", "Lucene deployment", Field.Store.YES,
                Field.Index.ANALYZED));
        docThree.add(new Field("contents", "we use the ant. Ant in Action is "
                + "a complete guide to using Ant to build, test, redistribute and "
                + "deploy Java applications.", Field.Store.NO, Field.Index.ANALYZED));
        writer.addDocument(docThree);

        writer.close();
    }

    public static void main(String[] args) throws Exception {
        // Directory dir = FSDirectory.getDirectory(args[0]);
        HighFreqTermViewer HFTV = new HighFreqTermViewer();
        HFTV.buildIndex();
        Directory dir = FSDirectory.open(temDirIndexFile);
        // Collect the most frequent terms of the "contents" field and print them.
        TermInfo[] terms = getHighFreqTerms(IndexReader.open(dir), null,
                new String[] { "contents" });
        for (int i = 0; i < terms.length; i++) {
            System.out.println(i + ".\t" + terms[i].term + " docFreq: "
                    + terms[i].docFreq);
        }
    }

    public static TermInfo[] getHighFreqTerms(IndexReader ir,
            Hashtable junkWords, String[] fields) throws Exception {
        return getHighFreqTerms(ir, junkWords, defaultNumTerms, fields);
    }

    public static TermInfo[] getHighFreqTerms(IndexReader reader,
            Hashtable junkWords, int numTerms, String[] fields)
            throws Exception {
        if (reader == null || fields == null)
            return null;
        TermInfoQueue tiq = new TermInfoQueue(numTerms);
        TermEnum terms = reader.terms();

        int minFreq = 0;
        while (terms.next()) {
            // Keep only terms that belong to one of the requested fields.
            String field = terms.term().field();
            if (fields != null && fields.length > 0) {
                boolean skip = true;
                for (int i = 0; i < fields.length; i++) {
                    if (field.equals(fields[i])) {
                        skip = false;
                        break;
                    }
                }
                if (skip)
                    continue;
            }
            // Skip terms the caller marked as junk words.
            if (junkWords != null && junkWords.get(terms.term().text()) != null)
                continue;
            if (terms.docFreq() > minFreq) {
                // tiq.put(new TermInfo(terms.term(), terms.docFreq()));
                tiq.add(new TermInfo(terms.term(), terms.docFreq()));
                if (tiq.size() >= numTerms) // if tiq overfull
                {
                    tiq.pop(); // remove lowest in tiq
                    minFreq = ((TermInfo) tiq.top()).docFreq; // reset minFreq
                }
            }
        }
        // Drain the queue (lowest docFreq first) and reverse into the result array,
        // so res[0] ends up holding the most frequent term.
        TermInfo[] res = new TermInfo[tiq.size()];
        for (int i = 0; i < res.length; i++) {
            res[res.length - i - 1] = (TermInfo) tiq.pop();
        }
        return res;
    }
}

final class TermInfoQueue extends PriorityQueue {
    TermInfoQueue(int size) {
        initialize(size);
    }

    // Order by document frequency so the least frequent term sits at the top of the heap.
    protected final boolean lessThan(Object a, Object b) {
        TermInfo termInfoA = (TermInfo) a;
        TermInfo termInfoB = (TermInfo) b;
        return termInfoA.docFreq < termInfoB.docFreq;
    }
}
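
The sample refers to a TermInfo helper class that is not included in the post. A minimal sketch, assuming only what the code above uses (a Term plus its document frequency, exposed as the fields term and docFreq), could look like this:

final class TermInfo {
    // Pairs a term with its document frequency so the queue can order the terms
    // and main() can print them. Field names are assumed from the usage above.
    Term term;
    int docFreq;

    TermInfo(Term t, int df) {
        term = t;
        docFreq = df;
    }
}

With such a class in the same Sample package, running HighFreqTermViewer.main builds the three-document index and prints the most frequent "contents" terms together with their docFreq values.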
