>>> import warc
>>> import requests
>>> from contextlib import closing
>>> from StringIO import StringIO
>>>
>>> def get_partial_warc_file(url, num_bytes=1024 * 10):
... """
... Download the first part of a WARC file and return a warc.WARCFile instance.
...
... url: the url of a gzipped WARC file
... num_bytes: the number of bytes to download. Default is 10KB
...
... return: warc.WARCFile instance
... """
... with closing(requests.get(url, stream=True)) as r:
... buf = StringIO(r.raw.read(num_bytes))
... return warc.WARCFile(fileobj=buf, compress=True)
...
>>> urls = {
... }
>>>
>>> files = {file_type: get_partial_warc_file(url=url) for file_type, url in urls.items()}
>>> # Alternative: to download each whole file rather than only its first bytes, use:
... # files = {file_type: warc.open(url) for file_type, url in urls.items()}
...
>>> def get_record_with_header(warc_file, header, value):
... for record, _, _ in warc_file.browse():
... if record.header.get(header) == value:
... return record
...
>>> warc_record = get_record_with_header(
... files['warc'],
... header='WARC-Type',
... value='response'
... )
Traceback (most recent call last):
File "<stdin>", line 4, in <module>
File "<stdin>", line 2, in get_record_with_header
File "/Library/Python/2.7/site-packages/warc/warc.py", line 295, in browse
for record in self.reader:
File "/Library/Python/2.7/site-packages/warc/warc.py", line 390, in __iter__
record = self.read_record()
File "/Library/Python/2.7/site-packages/warc/warc.py", line 367, in read_record
fileobj = self.fileobj.read_member()
File "/Library/Python/2.7/site-packages/warc/gzip2.py", line 104, in read_member
BaseGzipFile._read(self, 1)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/gzip.py", line 303, in _read
self._read_gzip_header()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/gzip.py", line 197, in _read_gzip_header
raise IOError, 'Not a gzipped file'
IOError: Not a gzipped file <<< - - - Error here
>>> wat_record = get_record_with_header(
... files['wat'],
... header='WARC-Refers-To',
... value=warc_record.header['WARC-Record-ID']
... )
Traceback (most recent call last):
File "<stdin>", line 4, in <module>
NameError: name 'warc_record' is not defined
>>>
>>> wet_record = get_record_with_header(
... files['wet'],
... header='WARC-Refers-To',
... value=warc_record.header['WARC-Record-ID']
... )
Traceback (most recent call last):
File "<stdin>", line 4, in <module>
NameError: name 'warc_record' is not defined
>>>
- - - - - -