Hello, I have an interesting problem. As I'm crawling through the different pages,
I'd like to strip out the HTML tags and just have the text that's associated with
the pages.
This is my mspider_spiders.py in spiders directory:
from mspider.html_strip import MLStripper
.............
def parse_pages(self, response):
hxs = HtmlXPathSelector(response)
html_strip = MLStripper()
item = MspiderItem()
html_strip.feed(hxs.select("//a/@href").extract())
item['links'] = html_strip.get_data()
html_strip.feed(hxs.select("//p").extract())
item['paragraph'] = html_strip.get_data()
html_strip.feed(hxs.select("//div").extract())
item['div'] = html_strip.get_data()
html_strip.feed(hxs.select("//span").extract())
item['span'] = html_strip.get_data()
#item['links'] = item.feed(hxs.select("//a/@href").extract()).get_data()
#item['paragraph'] = item.feed(hxs.select("//p").extract()).get_data()
#item['div'] = item.feed(hxs.select("//div").extract()).get_data()
#item['span'] = item.feed(hxs.select("//span").extract()).get_data()
return item
And this is my MLStripper class in the parent directory stored as html_strip.py:
from HTMLParser import HTMLParser
class MLStripper(HTMLParser):
def __init__(self):
self.reset()
self.fed = []
def handle_data(self, d):
self.fed.append(d)
def get_data(self):
return ''.join(self.fed)
#def strip_tags(html):
# s = MLStripper()
# s.feed(html)
# return s.get_data()
Now, I'm getting an error while running this code (shown at the bottom of the e-mail), but also, I'm thinking that my approach to this problem might have been done and accomplished many times before and I'd like to know how others have done this.
....................................
2013-06-25 13:37:04+0000 [mspider] ERROR: Spider error processing <GET
http://www.xbox.com:80/en-US/>
Traceback (most recent call last):
File "/usr/lib/python2.6/site-packages/Twisted-13.0.0-py2.6-linux-x86_64.egg/twisted/internet/base.py", line 1201, in mainLoop
self.runUntilCurrent()
File "/usr/lib/python2.6/site-packages/Twisted-13.0.0-py2.6-linux-x86_64.egg/twisted/internet/base.py", line 824, in runUntilCurrent
call.func(*call.args, **
call.kw)
File "/usr/lib/python2.6/site-packages/Twisted-13.0.0-py2.6-linux-x86_64.egg/twisted/internet/defer.py", line 380, in callback
self._startRunCallbacks(result)
File "/usr/lib/python2.6/site-packages/Twisted-13.0.0-py2.6-linux-x86_64.egg/twisted/internet/defer.py", line 488, in _startRunCallbacks
self._runCallbacks()
--- <exception caught here> ---
File "/usr/lib/python2.6/site-packages/Twisted-13.0.0-py2.6-linux-x86_64.egg/twisted/internet/defer.py", line 575, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/home/azureuser/scrapy_projects/mspider/mspider/spiders/mspider_spiders.py", line 60, in parse_pages
html_strip.feed(hxs.select("//a/@href").extract())
File "/usr/lib64/python2.6/HTMLParser.py", line 107, in feed
self.rawdata = self.rawdata + data
exceptions.TypeError: cannot concatenate 'str' and 'list' objects
2013-06-25 13:37:04+0000 [mspider] INFO: Closing spider (finished)
2013-06-25 13:37:04+0000 [mspider] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 13750,
'downloader/request_count': 52,
'downloader/request_method_count/GET': 52,
'downloader/response_bytes': 1655760,
'downloader/response_count': 52,
'downloader/response_status_count/200': 49,
'downloader/response_status_count/302': 3,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2013, 6, 25, 13, 37, 4, 821748),
'log_count/DEBUG': 60,
'log_count/ERROR': 49,
'log_count/INFO': 4,
'response_received_count': 49,
'scheduler/dequeued': 52,
'scheduler/dequeued/memory': 52,
'scheduler/enqueued': 52,
'scheduler/enqueued/memory': 52,
'spider_exceptions/TypeError': 49,
'start_time': datetime.datetime(2013, 6, 25, 13, 36, 58, 933191)}
2013-06-25 13:37:04+0000 [mspider] INFO: Spider closed (finished)