Hi,
One more question. This is what I am using in searchengine1.py
(below). I want it to exclude all self-references to Google, but the
output still contains Google domains — am I doing something wrong?
# searchengine1.py
import sys
import os

from urlnet.googletree import GoogleTree
import urlnet.log
from urlnet.clickprobabilities import \
     probabilityByPositionStopSmokingClicks
from urlnet.searchenginetree import \
     computeDescendingStraightLineProbabilityVector, \
     computeEqualProbabilityVector
from urlnet.urlutils import GetTimestampString

# Substrings identifying URLs to skip when building the network
# (search-engine self-references).  NOTE(review): defining this list
# has no effect by itself — it must be handed to the tree with
# net.SetIgnorableText(ignorableText), which appears to be the cause
# of the Google domains showing up in the output.
# textToIgnore = ['google','doubleclick',]
ignorableText = ['google','doubleclick',]
def main():
# uncomment one of the vectorGenerator assignments below
# vectorGenerator = computeEqualProbabilityVector
vectorGenerator = computeDescendingStraightLineProbabilityVector
"""
We are going to make a subdirectory under
the working directory that will be different each run.
"""
from urlnet.urlutils import GetConfigValue
from os.path import join
baseDir = GetConfigValue('workingDir')
# make unique directory to write to
timestamp = GetTimestampString()
workingDir = join(baseDir,timestamp)
oldDir = os.getcwd()
myLog = None
try:
try:
os.mkdir(baseDir)
except Exception, e:
pass #TODO: make sure it's because the dir already exists
try:
os.mkdir(workingDir)
except Exception, e:
pass #TODO: make sure it's because the dir already exists
os.chdir(workingDir)
myLog = urlnet.log.Log('main')
urlnet.log.logging=True
#log.trace=True
urlnet.log.altfd=open('searchengine1.log','w')
except Exception,e:
myLog.Write(str(e)+'\n')
goAhead = False
net = GoogleTree(_maxLevel=1,
_workingDir=workingDir,
_resultLimit=100,
_probabilityVector =
probabilityByPositionStopSmokingClicks,
_probabilityVectorGenerator = vectorGenerator)
"""
# workaround for bug in searchenginetree: always expects
ignorable
text
net.SetIgnorableText([])
# uncomment these lines if you want to see what results the top-
level query returns.
##################################################
######## get and view the result set URLs ########
##################################################
(queryURL,url,Urls) = net.GetSEResultSet('"true blood"')
print queryURL
print Urls
"""
"""
# comment out the lines below (from
BuildUrlForestWithPhantomRoot
through the WriteGuessFile calls) if you just want to see
# the result set and have activated the above lines.
"""
#########################################################
######## build a forest and output some networks ########
#########################################################
net.BuildUrlForestWithPhantomRoot('"true blood"')
#net.SetProperty('getTitles',True)
net.WritePajekFile('searchengine1','searchengine1' \
#,useTitles=True \
)
net.WriteGuessFile('searchengine1_urls'\
#,useTitles=True\
) # url network
net.WriteGuessFile('searchengine1_domains',False \
#,useTitles=True \
) #domain network
# tidy up
if urlnet.log.altfd:
urlnet.log.altfd.close()
urlnet.log.altfd = None
os.chdir(oldDir)
# Script entry point: run the build, then exit with status 0.
if __name__ == '__main__':
    main()
    sys.exit(0)