The search results can be customized to highlight the phrases that contain the requested keyword. The following code uses the “Highlighter” class from PyLucene and emits the result as HTML-formatted text.
"""Search a Lucene index and print highlighted fragments for every hit.

Usage: python <this script> QUERY

For each matching document the script prints the stored path followed by
the best-matching fragments of its contents, formatted by
SimpleHTMLFormatter.
"""
import sys

from lucene import (
    QueryParser, IndexSearcher, IndexReader, StandardAnalyzer,
    TermPositionVector, SimpleFSDirectory, File, SimpleSpanFragmenter,
    Highlighter, QueryScorer, StringReader, SimpleHTMLFormatter,
    VERSION, initVM, Version,
)

FIELD_CONTENTS = "contents"
FIELD_PATH = "path"
# Query text is the first command-line argument (e.g. "lucene and restored").
# It is None when the argument is missing so that merely importing this
# module does not raise IndexError; main() reports the usage error instead.
QUERY_STRING = sys.argv[1] if len(sys.argv) > 1 else None
STORE_DIR = "/home/kanaujia/lucene_index"

MAX_HITS = 50              # upper bound on the number of hits retrieved
FRAGMENT_SIZE = 64         # fragment size handed to SimpleSpanFragmenter
MAX_FRAGMENTS = 3          # fragments requested per document
FRAGMENT_SEPARATOR = "..."  # text placed between fragments


def main():
    """Run QUERY_STRING against the index in STORE_DIR and print highlights.

    Exits with a usage message when no query was given on the command line.
    """
    if QUERY_STRING is None:
        sys.exit("usage: %s QUERY" % sys.argv[0])

    initVM()
    # Single-argument print(...) produces the same output as the Python 2
    # print statement this script was written with.
    print("lucene %s" % VERSION)

    # Open the index directory read-only and search over that single reader.
    directory = SimpleFSDirectory(File(STORE_DIR))
    ireader = IndexReader.open(directory, True)
    searcher = IndexSearcher(ireader)
    try:
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        parser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)
        query = parser.parse(QUERY_STRING)

        topDocs = searcher.search(query, MAX_HITS)
        # totalHits is the number of matches in the index; scoreDocs only
        # holds the (at most MAX_HITS) hits that were actually returned.
        print("%s total matching documents." % topDocs.totalHits)

        # The same scorer drives both the highlighter and the fragmenter.
        scorer = QueryScorer(query)
        highlighter = Highlighter(SimpleHTMLFormatter(), scorer)
        highlighter.setTextFragmenter(
            SimpleSpanFragmenter(scorer, FRAGMENT_SIZE))

        for scoreDoc in topDocs.scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print(doc.get(FIELD_PATH))
            text = doc.get(FIELD_CONTENTS)
            # The contents field may be absent from the stored document
            # (indexed but not stored); there is nothing to highlight then.
            if text is not None:
                ts = analyzer.tokenStream(FIELD_CONTENTS, StringReader(text))
                print(highlighter.getBestFragments(
                    ts, text, MAX_FRAGMENTS, FRAGMENT_SEPARATOR))
            print("")
    finally:
        # Closing the searcher does not close a reader that was passed in,
        # so release both explicitly.
        searcher.close()
        ireader.close()


if __name__ == '__main__':
    main()
The code is an extension of the search code discussed in Part-II.
We create an HTML formatter with SimpleHTMLFormatter. We then create a QueryScorer from the query; the highlighter uses it to score text fragments by the query terms they contain.
# Build the highlighting pieces: the HTML formatter, a scorer derived from
# `query`, and the highlighter that combines the two.
HighlightFormatter = SimpleHTMLFormatter()
query_score = QueryScorer(query)
highlighter = Highlighter(HighlightFormatter, query_score)
We break the text content into fragments of 64 characters.
# Give the highlighter a fragmenter built from the same scorer, with a
# fragment size of 64.
fragmenter = SimpleSpanFragmenter(query_score, 64)
highlighter.setTextFragmenter(fragmenter)
for scoreDoc in scoreDocs:
doc = searcher.doc(scoreDoc.doc)
text = doc.get(FIELD_CONTENTS)
ts = analyzer.tokenStream(FIELD_CONTENTS, StringReader(text))
print doc.get(FIELD_PATH)
Now we ask the highlighter for the best-matching fragments of each document: at most 3 fragments, joined by "...".
print highlighter.getBestFragments(ts, text, 3, “…”)
Results
kanaujia@ubuntu:~/work/Py/pylucy2/pylucy$ python searcher_highlight.py hello lucene 3.6.1 50 total matching documents. /home/kanaujia/Dropbox/PyConIndia/fsMgr/root/hello hi hello /home/kanaujia/Dropbox/PyConIndia/fsMgr.v4/root/hello hi hello /home/kanaujia/Dropbox/.dropbox.cache/2012-09-27/hello (deleted 505bda23-9-e8756a51) hi hello /home/kanaujia/Dropbox/PyConIndia/fsMgr.v1/root/hello.html Hello htmls {% module Hello() %}