关于使用Filter减少Lucene tf idf打分计算的调研
时间:2014-04-27 19:05:43
收藏:0
阅读:714
将query改成filter,lucene中有个QueryWrapperFilter性能比较差,所以基本上都需要自己写filter,包括TermFilter,ExactPhraseFilter,ConjunctionFilter,DisjunctionFilter。
这几天验证下来,还是or改善最明显,4个termfilter,4508个返回结果,在我本机上性能提高1/3。ExactPhraseFilter也有小幅提升(5%-10%)。
最令人不解的是and,原来以为跟结果数和子查询数相关,但几次测试基本都是下降。
附ExactPhraseFilter和ut代码:
import java.io.IOException;
import java.util.ArrayList;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
// A fake to lucene phrase query, but far simplified.
public class ExactPhraseFilter extends Filter {
protected final ArrayList<Term> terms = new ArrayList<Term>();
protected final ArrayList<Integer> positions = new ArrayList<Integer>();
protected String fieldName;
public void add(Term term) {
if (terms.size() == 0) {
fieldName = term.field();
} else {
assert fieldName == term.field();
}
positions.add(Integer.valueOf(terms.size()));
terms.add(term);
}
@Override
public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException
{
return new ExactPhraseDocIdSet(context, acceptDocs);
}
static class PostingAndFreq implements Comparable<PostingAndFreq> {
DocsAndPositionsEnum posEnum;
int docFreq;
int position;
boolean useAdvance;
int posFreq = 0;
int pos = -1;
int posTime = 0;
public PostingAndFreq(DocsAndPositionsEnum posEnum, int docFreq, int position, boolean useAdvance) {
this.posEnum = posEnum;
this.docFreq = docFreq;
this.position = position;
this.useAdvance = useAdvance;
}
@Override
public int compareTo(PostingAndFreq other) {
if (docFreq != other.docFreq) {
return docFreq - other.docFreq;
}
if (position != other.position) {
return position - other.position;
}
return 0;
}
}
protected class ExactPhraseDocIdSet extends DocIdSet {
protected final AtomicReaderContext context;
protected final Bits acceptDocs;
protected final PostingAndFreq[] postings;
protected boolean noDocs = false;
public ExactPhraseDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
this.context = context;
this.acceptDocs = acceptDocs;
Terms fieldTerms = context.reader().fields().terms(fieldName);
// TermContext states[] = new TermContext[terms.size()];
postings = new PostingAndFreq[terms.size()];
TermsEnum te = fieldTerms.iterator(null);
for (int i = 0; i < terms.size(); ++i) {
final Term t = terms.get(i);
// states[i] = TermContext.build(context, terms.get(i), true);
// final TermState state = states[i].get(context.ord);
if (!te.seekExact(t.bytes(), true)) {
noDocs = true;
return;
}
if (i == 0) {
postings[i] = new PostingAndFreq(te.docsAndPositions(acceptDocs, null, 0), te.docFreq(), positions.get(i), false);
} else {
postings[i] = new PostingAndFreq(te.docsAndPositions(acceptDocs, null, 0), te.docFreq(), positions.get(i), te.docFreq() > 5 * postings[0].docFreq);
}
}
ArrayUtil.mergeSort(postings);
for (int i = 1; i < terms.size(); ++i) {
postings[i].posEnum.nextDoc();
}
}
@Override
public DocIdSetIterator iterator() throws IOException
{
if (noDocs) {
return EMPTY_DOCIDSET.iterator();
} else {
return new ExactPhraseDocIdSetIterator(context, acceptDocs);
}
}
protected class ExactPhraseDocIdSetIterator extends DocIdSetIterator {
protected int docID = -1;
public ExactPhraseDocIdSetIterator(AtomicReaderContext context, Bits acceptDocs) throws IOException {
}
@Override
public int nextDoc() throws IOException {
while (true) {
// first (rarest) term
final int doc = postings[0].posEnum.nextDoc();
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
// System.err.println("END");
return docID = doc;
}
// non-first terms
int i = 1;
while (i < postings.length) {
final PostingAndFreq pf = postings[i];
int doc2 = pf.posEnum.docID();
if (pf.useAdvance) {
if (doc2 < doc) {
doc2 = pf.posEnum.advance(doc);
}
} else {
int iter = 0;
while (doc2 < doc) {
if (++iter == 50) {
doc2 = pf.posEnum.advance(doc);
} else {
doc2 = pf.posEnum.nextDoc();
}
}
}
if (doc2 > doc) {
break;
}
++i;
}
if (i == postings.length) {
// System.err.println(doc);
docID = doc;
// return docID;
if (containsPhrase()) {
return docID;
}
}
}
}
@Override
public int advance(int target) throws IOException {
throw new IOException();
}
private boolean containsPhrase() throws IOException {
int index = -1;
int i = 0;
PostingAndFreq pf;
// init.
for (i = 0; i < postings.length; ++i) {
postings[i].posFreq = postings[i].posEnum.freq();
postings[i].pos = postings[i].posEnum.nextPosition() - postings[i].position;
postings[i].posTime = 1;
}
while (true) {
pf = postings[0];
// first term.
while (pf.pos < index && pf.posTime < pf.posFreq) {
pf.pos = pf.posEnum.nextPosition() - pf.position;
++pf.posTime;
}
if (pf.pos >= index) {
index = pf.pos;
} else if (pf.posTime == pf.posFreq) {
return false;
}
// other terms.
for (i = 1; i < postings.length; ++i) {
pf = postings[i];
while (pf.pos < index && pf.posTime < pf.posFreq) {
pf.pos = pf.posEnum.nextPosition() - pf.position;
++pf.posTime;
}
if (pf.pos > index) {
index = pf.pos;
break;
}
if (pf.pos == index) {
continue;
}
if (pf.posTime == pf.posFreq) {
return false; } } if (i == postings.length) { return true; } } } @Override public int docID() { return docID; } } } }
UT:
import java.io.IOException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.TextField; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.search.ConstantScoreQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Version; import org.testng.annotations.AfterTest; import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; import com.dp.arts.lucenex.codec.Dp10Codec; public class ExactPhraseFilterTest { final Directory dir = new RAMDirectory(); @BeforeTest public void setUp() throws IOException { Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer); iwc.setOpenMode(OpenMode.CREATE); iwc.setCodec(Codec.forName(Dp10Codec.DP10_CODEC_NAME)); IndexWriter writer = new IndexWriter(dir, iwc); addDocument(writer, "新疆烧烤"); // 0 addDocument(writer, "啤酒"); // 1 addDocument(writer, "烤烧"); // 2 addDocument(writer, "烧烧烧"); // 3 addDocument(writer, "烤烧中华烧烤"); // 4 writer.close(); } private void addDocument(IndexWriter writer, String str) throws IOException { Document doc = new Document(); doc.add(new TextField("searchkeywords", str, Store.YES)); writer.addDocument(doc, new StandardAnalyzer(Version.LUCENE_40)); } @AfterTest public void tearDown() throws IOException { this.dir.close(); } @Test public void test1() throws IOException { IndexReader reader = DirectoryReader.open(dir); IndexSearcher searcher = new IndexSearcher(reader); ExactPhraseFilter pf = new ExactPhraseFilter(); pf.add(new Term("searchkeywords", "烧")); pf.add(new Term("searchkeywords", "烤")); Query query = new ConstantScoreQuery(pf); TopDocs results = searcher.search(query, 20); assert results.totalHits == 2; assert results.scoreDocs[0].doc == 0; assert results.scoreDocs[1].doc == 4; searcher.getIndexReader().close(); } }
评论(0)