Hi,
Hi, I am new to Scrapy and am having a little problem. After I get logged into the site, I can see that I get to the page I want, but now I need to go to the linked pages. I can't seem to get there.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import FormRequest
from scrapy.http import Request
from scrapy.shell import inspect_response
from dmoz.items import DmozItem
class LoginSpider(BaseSpider):
print("Now it's login me #####################")
domain_name="Prosper.com"
name = 'login'
start_urls = [
]
def parse(self, response):
print()
print("Let's Parse %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
if "Incorrect username or password. Please try again." in response.body:
print ("I'm in TRUE")
self.pdf_doc_now(response)
else:
print ("I'm in FALSE")
return [FormRequest.from_response(
response,
formname="aspnetForm",
formdata={
"M$ctl00$ctl00$MainContent$MainContent$MainContent$c7$tbEmail" : "The ID",
"M$ctl00$ctl00$MainContent$MainContent$MainContent$c7$tbPwd" : "The Password"},
callback=self.after_login
)
]
print("################### And in the End###################")
def after_login(self, response):
print("I am in after login ****************************************")
#inspect_response(response)
if "Incorrect username or password. Please try again." in response.body:
print "LOGIN FAILED"
else:
print "LOGIN SUCCEED"
self.get_stmt(response)
def get_stmt(self, response):
print("I should get a statement now, what do you think??????????????????")
for stmt in self.store_stmt(response):
print("Before$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$")
self.pdf_doc_now(stmt)
stmt
print("After$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$")
print()
print("after store_stmt call")
print()
def store_stmt(self, response):
print("I'm close to the end of the line^^^^^^^^^^^^^^^^^^^^^^^^^^^")
hxs = HtmlXPathSelector(response)
print(response)
pdfLinks = hxs.select("//a[contains(@href, 'PDFFilePage')]", )
items = []
for site in pdfLinks:
item = DmozItem()
item['title'] = site.select('a/text()').extract()
item['link'] = site.select('@href').extract().pop()
print ("\n")
print("The Link is:", item['link'])
item['desc'] = site.select('text()').extract()
items.append(item)
print("Before Request %$%%$%$%$%$%$%$%$%$%")
stuff=self.get_pdf_doc(item['link'], response)
print ("I'm Printing STUFF" , stuff)
print ("Before YIELD????????????????????")
print (site.select('@href').extract().pop())
yield Request(site.select('@href').extract().pop(), callback=self.parse, errback=self.failure, dont_filter=True)
print("After Request @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
print("============================I'm out of store_stmt")
def get_pdf_doc(self, item, response):
print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
print(item)
return [Request(item, callback=self.pdf_doc_now)]
def pdf_doc_now(self, response):
#
#
# When I get here, the response parameter contains the text of the Request
#
#
print("Even Closer ++++++++++++++++++++++++++++++++++++++++++++++++++++++")
print("parm:", response)
return response
def failure(self,response):
print("Why am I here???????????????????")
I never get to the failure function, I just never get the pages from the scraped links:
2012-01-30 12:55:47-0500 [scrapy] INFO: Scrapy 0.14.1 started (bot: dmoz)
2012-01-30 12:55:48-0500 [scrapy] DEBUG: Enabled extensions: LogStats, TelnetConsole, CloseSpider, W
ebService, CoreStats, SpiderState
$$$$$$$$$$$$$$$$In the beginning there was me%%%%%%%%%%%%%%%%%%
Now it's login me #####################
2012-01-30 12:55:48-0500 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware, Downloa
dTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, RedirectMiddlewa
re, CookiesMiddleware, HttpCompressionMiddleware, ChunkedTransferMiddleware, DownloaderStats
2012-01-30 12:55:48-0500 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMid
dleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2012-01-30 12:55:49-0500 [scrapy] DEBUG: Enabled item pipelines:
2012-01-30 12:55:49-0500 [login] INFO: Spider opened
2012-01-30 12:55:49-0500 [login] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items
/min)
2012-01-30 12:55:49-0500 [scrapy] DEBUG: Telnet console listening on
0.0.0.0:60232012-01-30 12:55:49-0500 [scrapy] DEBUG: Web service listening on
0.0.0.0:6080mmon/login.aspx?ReturnUrl=%2fsecure%2faccount%2fcommon%2fstatements.aspx> from <GET
http://www.Prospgin.aspx?ReturnUrl=%2fsecure%2faccount%2fcommon%2fstatements.aspx> (referer: None)
()
Let's Parse %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
I'm in FALSE
%2fsecure%2faccount%2fcommon%2fstatements.aspx>
e%2faccount%2fcommon%2fstatements.aspx)
I am in after login ****************************************
LOGIN SUCCEED
I should get a statement now, what do you think??????????????????
I'm close to the end of the line^^^^^^^^^^^^^^^^^^^^^^^^^^^
db5770d59d2644621&stmtdtid=2')
Before Request %$%%$%$%$%$%$%$%$%$%
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
tmtdtid=2
8e8b73b4f1db5770d59d2644621&stmtdtid=2>])
Before YIELD????????????????????
tmtdtid=2
Before$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
Even Closer ++++++++++++++++++++++++++++++++++++++++++++++++++++++
After$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
Before$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
Even Closer ++++++++++++++++++++++++++++++++++++++++++++++++++++++
70d59d2644621&stmtdtid=2>)
After$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
After Request @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
============================I'm out of store_stmt
()
after store_stmt call
()
2012-01-30 12:55:52-0500 [login] INFO: Closing spider (finished)
2012-01-30 12:55:52-0500 [login] INFO: Dumping spider stats:
{'downloader/request_bytes': 3408,
'downloader/request_count': 4,
'downloader/request_method_count/GET': 3,
'downloader/request_method_count/POST': 1,
'downloader/response_bytes': 82318,
'downloader/response_count': 4,
'downloader/response_status_count/200': 2,
'downloader/response_status_count/302': 2,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2012, 1, 30, 17, 55, 52, 168000),
'request_depth_max': 1,
'scheduler/memory_enqueued': 4,
'start_time': datetime.datetime(2012, 1, 30, 17, 55, 49, 27000)}
2012-01-30 12:55:52-0500 [login] INFO: Spider closed (finished)
2012-01-30 12:55:52-0500 [scrapy] INFO: Dumping global stats:
{}
So, my question is "How do I get Scrapy to actually process the request from the link?"
Thanks,
Tom