Hi - this is great news, thanks. I ran the script below with "social
media" as my required phrase, but the output still seemed to include
references to pages that do not contain this phrase in the page text.
Am I doing something wrong?
# urlforest1.py
from urlnet.urltree import UrlTree
import re # for search flags
# Build a URL forest from a list of root URLs and write it out in several
# network-file formats.  Formatting restored from a whitespace-mangled paste:
# the wrapped comment and the unindented ``if`` body were syntax errors.
net = UrlTree(_useHostNameForDomainName=True, _maxLevel=1)

# Patterns handed to the 'include_patternlist' property.
# New words or phrases can be added by copying and modifying the
# 'social media' line below.
# NOTE(review): whether these patterns are matched against page text or
# against URLs/anchors is not visible here -- confirm against the UrlNet
# documentation if pages without the phrase still show up in the output.
incl_patternlist = [
    'social media',
]
net.SetProperty('include_patternlist', incl_patternlist)

# Root URLs for the forest (placeholder value from the original post).
urlforest = (
    'list_of_urls',
)

# Text fragments passed to SetIgnorableText (placeholder value).
ignorableText = [
    'list_of_text_to_ignore',
]
net.SetIgnorableText(ignorableText)

success = net.BuildUrlForest(Urls=urlforest)

# Only write output files when the build reports success.
# NOTE(review): the original indentation was lost; all three writes are
# assumed to belong to this branch -- confirm against the author's file.
if success:
    net.WritePajekFile('urlforest', 'urlforest')
    net.WritePajekNetworkFile('urltree2', 'urltree2domains', urlNet=False)
    net.WritePairNetworkFile(
        'urltree1',
        'urltree1domains',
        urlNet=False,  # do domains instead
        uniquePairs=True,
        delimiter=' ',
    )
> urlnetandexamples.zip
> 299K View Download - Hide quoted text -