import os

from whoosh.analysis import StopFilter, CompositeAnalyzer, RegexTokenizer, LowercaseFilter
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in, open_dir
from whoosh.qparser import QueryParser
# Demo script: index one short document with a custom regex tokenizer, then
# look up each whitespace-separated word of that document and print the hits.

# A StopFilter configured with an empty stop list and minsize=1.  Note that
# the schema below builds its own default StopFilter() and does not use this
# object.
analyzer = StopFilter(stoplist=[], minsize=1)

# Verbose-mode token pattern.  Alternatives are tried in order, so a word
# followed by "++" (e.g. "c++") is captured before the plain/hyphenated word
# rule.  In the final character class the brackets are listed explicitly and
# the hyphen is placed last so it is a literal; written as ":-_" it formed a
# range from ':' to '_' and matched '<', '=', '>', '@', '\', '^' and A-Z.
expression = r'''(?x) # set flag to allow verbose regexps
(?:[A-Z]\.)+ # abbreviations, e.g. U.S.A.
|\w+\+\+
|\$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
|\w+(?:-\w+)*
|\.\.\. # ellipsis
|(?:[\[\].,;"'?():_`-]) # these are separate tokens; includes ], [
'''

# Single stored text field: regex tokenizer -> lowercase -> default stop filter.
schema = Schema(
    content=TEXT(
        stored=True,
        analyzer=CompositeAnalyzer(
            RegexTokenizer(expression=expression),
            LowercaseFilter(),
            StopFilter(),
        ),
    )
)

# Make sure the index directory exists before asking Whoosh to create an
# index inside it.
index_dir = "match_cpp"
os.makedirs(index_dir, exist_ok=True)
ix = create_in(index_dir, schema)

# Index a single document.
writer = ix.writer()
content = 'c++ wi-fi hello world'
writer.add_document(content=content)
writer.commit()

# Search for every word of the document and print the parsed query followed
# by its hits, with a blank line between words.
with ix.searcher() as searcher:
    # The parser does not depend on the word, so build it once.
    parser = QueryParser('content', ix.schema)
    words = content.split()
    for word in words:
        query = parser.parse(word)
        print(query)
        results = searcher.search(query)
        for result in results:
            print(result)
        print()
```