The static page source contains only an empty container:

<body class="sapUiBody" role="application">
<div id="ctrRoot"></div>
</body>

After the JavaScript has run, the same page looks like this:

<body class="sapUiBody" role="application" style="margin: 0px;">
<div id="ctrRoot" data-sap-ui-area="ctrRoot">
<div id="__shell0" data-sap-ui="__shell0" class="sapDkShell sapUiUx3Shell sapUiUx3ShellDesignStandard sapUiUx3ShellFullHeightContent sapUiUx3ShellHeadStandard sapUiUx3ShellNoContentPadding">
... Lots of crap here ...
</div>
</div>
</body>

The documentation itself lives in an iframe that starts out empty:

<div id="demokitSplitter_secondPane" class="sapUiVSplitterSecondPane" style="overflow: hidden; width: 79.7396%;">
<iframe id="content" name="content" src="about:blank" frameborder="0" onload="sap.ui.demokit.DemokitApp.getInstance().onContentLoaded();" data-sap-ui-preserve="content">
</iframe>
</div>

Once the content document has loaded, the same iframe holds the page we actually want:

<div id="demokitSplitter_secondPane" class="sapUiVSplitterSecondPane" style="overflow: hidden; width: 79.7396%;">
<iframe id="content" name="content" src="about:blank" frameborder="0" onload="sap.ui.demokit.DemokitApp.getInstance().onContentLoaded();" data-sap-ui-preserve="content">
<html xml:lang="en" lang="en" data-highlight-query-terms="pending">
<body>
<div id="main">
<div id="content">
<div class="full-description">
</div>
<div class="summary section">
<div class="sectionItems">
<div class="sectionItem itemName namespace static">
<b class="icon" title="Analysis Path Framework">
<a href="test.html">test</a>
</b>
<span class="description">Analysis Path Framework</span>
</div>
<div class="sectionItem itemName namespace static">
<b class="icon" title="Test2">
<a href="test.html">test2</a>
</b>
<span class="description">Test2</span>
</div>
</div>
</div>
</div>
</div>
</body>
</html>
</iframe>
</div>

The elements we are after are the <div class="sectionItem itemName namespace static"> entries inside <div class="sectionItems">.
First attempt: a Selenium/PhantomJS download handler that dumps the page and then waits for the iframe.

import codecs
import time

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def _response(self, _, driver, spider):
    print 'PhantomJSDownloadHandler _response writing first.html, possibly empty html (due to AJAX) %s' % time.asctime(time.localtime(time.time()))
    # Dump whatever has rendered so far; this can still be the empty shell.
    target = codecs.open('first.html', 'w', 'utf-8')
    target.truncate()
    target.write(driver.page_source)
    target.close()
    print 'PhantomJSDownloadHandler waiting for sectionItems %s' % time.asctime(time.localtime(time.time()))
    max_time_to_wait_sec = 20
    poll_frequency_sec = 0.2
    # Earlier attempts that never matched, because the elements live inside the iframe:
    #WebDriverWait(driver, max_time_to_wait_sec, poll_frequency_sec).until(
    #    EC.presence_of_element_located((By.CLASS_NAME, "sectionItems")))
    #driver.find_elements_by_xpath('//div[@class="sectionItems"]')
    # Switch into the iframe first. Note the locator is (By.ID, "content"):
    # Selenium's By class has no lower-case By.id attribute.
    WebDriverWait(driver, max_time_to_wait_sec, poll_frequency=poll_frequency_sec).until(
        EC.frame_to_be_available_and_switch_to_it((By.ID, 'content')))
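Once the switch succeeds, the original sectionItems wait should work inside the frame. A hypothetical continuation (second.html and the switch back are my additions, not part of the original handler):

    WebDriverWait(driver, max_time_to_wait_sec, poll_frequency=poll_frequency_sec).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'sectionItems')))
    # page_source now reflects the iframe's document, not the outer shell
    target = codecs.open('second.html', 'w', 'utf-8')
    target.write(driver.page_source)
    target.close()
    driver.switch_to.default_content()  # leave the iframe when done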
A related example from http://stackoverflow.com/questions/25057174/scrapy-crawl-in-order:
def parse(self, response):
    for link in response.xpath("//article/a/@href").extract():
        yield Request(link, callback=self.parse_page, meta={'link': link})

def parse_page(self, response):
    for frame in response.xpath("//iframe").extract():
        item = MyItem()
        item['link'] = response.meta['link']
        item['frame'] = frame
        yield item
# Working: finds the first sectionItems container
print 'checking for <div class="sectionItems">'
sectionItems = namespace.xpath(".//div[@class='summary section']/div[@class='sectionItems']")
# Earlier attempts:
#sections = hxs.xpath("//div[@class='sectionItem']")
#sections = hxs.xpath("//div[contains(@class, 'sectionItem itemName namespace static')]")
print 'xpath sectionItems: %s' % sectionItems
for sectionItem in sectionItems:
    print 'Found sectionItem'
    # re:test is an EXSLT extension; Scrapy selectors support it out of the box
    sections = sectionItem.xpath("div[re:test(@class, 'sectionItem')]")
    for section in sections:
        print 'Found section: %s' % section.extract()
You can scrape that with a little help from Splash: https://github.com/scrapy-plugins/scrapy-splash
sudo docker run -p 5023:5023 -p 8050:8050 -p 8051:8051 scrapinghub/splash
curl 'http://localhost:8050/render.html?url=https://sapui5.hana.ondemand.com/sdk/#docs/api/symbols/sap.html'
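Note that the # in that query string starts the fragment of the localhost URL, so Splash only receives url=https://sapui5.hana.ondemand.com/sdk/ (you can see the truncated url in the event log below). Percent-encoding the hash should pass the full URL through; an untested sketch:

curl 'http://localhost:8050/render.html?url=https://sapui5.hana.ondemand.com/sdk/%23docs/api/symbols/sap.html'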
2016-06-03 18:45:30+0000 [-] Log opened.
2016-06-03 18:45:30.977689 [-] Splash version: 2.1
2016-06-03 18:45:30.978469 [-] Qt 5.5.1, PyQt 5.5.1, WebKit 538.1, sip 4.17, Twisted 16.1.1, Lua 5.2
2016-06-03 18:45:30.978726 [-] Python 3.4.3 (default, Oct 14 2015, 20:28:29) [GCC 4.8.4]
2016-06-03 18:45:30.979088 [-] Open files limit: 1048576
2016-06-03 18:45:30.979314 [-] Can't bump open files limit
2016-06-03 18:45:31.082894 [-] Xvfb is started: ['Xvfb', ':1', '-screen', '0', '1024x768x24']
2016-06-03 18:45:31.156718 [-] proxy profiles support is enabled, proxy profiles path: /etc/splash/proxy-profiles
2016-06-03 18:45:31.249329 [-] verbosity=1
2016-06-03 18:45:31.249597 [-] slots=50
2016-06-03 18:45:31.254768 [-] argument_cache_max_entries=500
2016-06-03 18:45:31.255229 [-] Web UI: enabled, Lua: enabled (sandbox: enabled)
2016-06-03 18:45:31.257577 [-] Site starting on 8050
2016-06-03 18:45:31.257732 [-] Starting factory <twisted.web.server.Site object at 0x7fb3b5ab7e48>
process 1: D-Bus library appears to be incorrectly set up; failed to read machine uuid: Failed to open "/etc/machine-id": No such file or directory
See the manual page for dbus-uuidgen to correct this issue.
2016-06-03 18:45:36.244542 [events] {"load": [0.29, 0.11, 0.07], "client_ip": "172.17.0.1", "path": "/render.html", "timestamp": 1464979536, "args": {"uid": 140409823866608, "url": "https://sapui5.hana.ondemand.com/sdk/"}, "active": 0, "_id": 140409823866608, "maxrss": 101772, "user-agent": "curl/7.47.0", "rendertime": 0.539576530456543, "qsize": 0, "method": "GET", "status_code": 200, "fds": 20}
2016-06-03 18:45:36.244885 [-] "172.17.0.1" - - [03/Jun/2016:18:45:35 +0000] "GET /render.html?url=https://sapui5.hana.ondemand.com/sdk/ HTTP/1.1" 200 10270 "-" "curl/7.47.0"
Ubuntu already ships Python 2.7, but we still need Scrapy. pip is not installed by default, so install it along with Scrapy's build dependencies:
sudo apt-get -y install python-pip
sudo apt-get -y install libxml2-dev libxslt1-dev libffi-dev libssl-dev python-dev
sudo pip install scrapy
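To confirm the toolchain is in place (scrapy version is a standard subcommand; -v also prints the lxml, Twisted, and Python versions):

scrapy version -v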
Create a new, blank Scrapy project:
root@ubuntu:/opt/scrapy# scrapy startproject ui5
New Scrapy project 'ui5', using template directory '/usr/local/lib/python2.7/dist-packages/scrapy/templates/project', created in:
/opt/scrapy/ui5
You can start your first spider with:
cd ui5
scrapy genspider example example.com
root@ubuntu:/opt/scrapy# cd ui5
root@ubuntu:/opt/scrapy/ui5# scrapy genspider sapui5 sapui5.hana.ondemand.com
Created spider 'sapui5' using template 'basic' in module:
ui5.spiders.sapui5
Now run the scrapy project with:
root@ubuntu:/opt/scrapy/ui5# scrapy crawl sapui5
-- You can see here that the generated start URL is incorrect (a www. prefix was added):
2016-06-03 10:57:35 [scrapy] DEBUG: Retrying <GET http://www.sapui5.hana.ondemand.com/robots.txt> (failed 1
root@ubuntu:/opt/scrapy/ui5# vim ui5/spiders/sapui5.py
-- Update the URL to be correct
class Sapui5Spider(scrapy.Spider):
    name = "sapui5"
    allowed_domains = ["sapui5.hana.ondemand.com"]
    start_urls = (
        'https://sapui5.hana.ondemand.com/sdk/#docs/api/symbols/sap.html/',
    )

    def parse(self, response):
        pass
root@ubuntu:/opt/scrapy/ui5# scrapy crawl sapui5
-- Success, but nothing was actually scraped: the page loads its real content into an iframe via JavaScript (see the quick check after the log below).
2016-06-03 11:01:25 [scrapy] INFO: Enabled item pipelines:
[]
2016-06-03 11:01:25 [scrapy] INFO: Spider opened
2016-06-03 11:01:25 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-06-03 11:01:25 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2016-06-03 11:01:25 [scrapy] DEBUG: Crawled (404) <GET https://sapui5.hana.ondemand.com/robots.txt> (referer: None)
2016-06-03 11:01:25 [scrapy] DEBUG: Crawled (200) <GET https://sapui5.hana.ondemand.com/sdk/#docs/api/symbols/sap.html/> (referer: None)
2016-06-03 11:01:25 [scrapy] INFO: Closing spider (finished)
2016-06-03 11:01:25 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 458,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 4105,
'downloader/response_count': 2,
'downloader/response_status_count/200': 1,
'downloader/response_status_count/404': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 6, 3, 18, 1, 25, 649281),
'log_count/DEBUG': 3,
'log_count/INFO': 7,
'response_received_count': 2,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2016, 6, 3, 18, 1, 25, 48404)}
2016-06-03 11:01:25 [scrapy] INFO: Spider closed (finished)
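A quick way to confirm that the plain HTTP response carries none of the API content (a sketch using scrapy shell; the .sectionItem selector comes from the rendered markup pasted at the top):

scrapy shell 'https://sapui5.hana.ondemand.com/sdk/'
>>> response.css('.sectionItem')   # expected: [] because the docs are rendered client-side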
-- Edit the spider and update it to go through Splash:
root@ubuntu:/opt/scrapy/ui5# vim ui5/spiders/sapui5.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest


class Sapui5Spider(scrapy.Spider):
    name = "sapui5"
    start_urls = ['https://sapui5.hana.ondemand.com/']

    def parse(self, response):
        url = 'https://sapui5.hana.ondemand.com/sdk/#docs/api/symbols/sap.html'
        yield SplashRequest(url, self.parse_page,
                            args={
                                'wait': 5.,
                                'iframes': True,
                                'html': True,
                            },
                            endpoint='render.json')

    def parse_page(self, response):
        iframe_html = response.data['childFrames'][0]['html']
        sel = scrapy.Selector(text=iframe_html)
        for div in sel.css('#content .sectionItem'):
            name = div.css('a::text').extract_first()
            desc = div.css('.description::text').extract_first() or ''
            print(': '.join([name, desc]))
Install the scrapy-splash plugin and enable it in the project settings:

pip install scrapy-splash
root@ubuntu:/opt/scrapy/ui5/ui5# vim settings.py
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'ui5.middlewares.MyCustomDownloaderMiddleware': 543,
#}
DOWNLOADER_MIDDLEWARES = {
'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
}
SPLASH_URL = 'http://localhost:8050/'
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
-- The middleware needs to take precedence over HttpProxyMiddleware, which by default is at position 750, so we set the middleware positions to numbers below 750.
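To double-check where the Splash middlewares land relative to HttpProxyMiddleware, you can merge the base and project settings yourself. A sketch, run from the project directory (get_project_settings and getdict are standard Scrapy APIs; the merge mimics what Scrapy does internally):

from scrapy.utils.project import get_project_settings

settings = get_project_settings()
orders = dict(settings.getdict('DOWNLOADER_MIDDLEWARES_BASE'))
orders.update(settings.getdict('DOWNLOADER_MIDDLEWARES'))
for name, position in sorted(orders.items(), key=lambda item: item[1]):
    print position, name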
root@ubuntu:/opt/scrapy/ui5/ui5/spiders# scrapy runspider sapui5.py
2016-06-03 11:54:26 [scrapy] INFO: Scrapy 1.1.0 started (bot: ui5)
2016-06-03 11:54:26 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'ui5.spiders', 'ROBOTSTXT_OBEY': True, 'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter', 'SPIDER_MODULES': ['ui5.spiders'], 'BOT_NAME': 'ui5', 'HTTPCACHE_STORAGE': 'scrapy_splash.SplashAwareFSCacheStorage'}
2016-06-03 11:54:26 [scrapy] INFO: Enabled extensions:
['scrapy.extensions.logstats.LogStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.corestats.CoreStats']
2016-06-03 11:54:26 [scrapy] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy_splash.SplashCookiesMiddleware',
'scrapy_splash.SplashMiddleware',
'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2016-06-03 11:54:26 [scrapy] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2016-06-03 11:54:26 [scrapy] INFO: Enabled item pipelines:
[]
2016-06-03 11:54:26 [scrapy] INFO: Spider opened
2016-06-03 11:54:26 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-06-03 11:54:26 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2016-06-03 11:54:27 [scrapy] DEBUG: Crawled (404) <GET https://sapui5.hana.ondemand.com/robots.txt> (referer: None)
2016-06-03 11:54:27 [scrapy] DEBUG: Crawled (200) <GET https://sapui5.hana.ondemand.com/> (referer: None)
2016-06-03 11:54:27 [scrapy] DEBUG: Crawled (404) <GET http://localhost:8050/robots.txt> (referer: None)
2016-06-03 11:54:31 [scrapy] DEBUG: Crawled (200) <GET https://sapui5.hana.ondemand.com/sdk/#docs/api/symbols/sap.html via http://localhost:8050/render.json> (referer: None)
2016-06-03 11:54:32 [scrapy] ERROR: Spider error processing <GET https://sapui5.hana.ondemand.com/sdk/#docs/api/symbols/sap.html via http://localhost:8050/render.json> (referer: None)
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 588, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/opt/scrapy/ui5/ui5/spiders/sapui5.py", line 20, in parse_page
iframe_html = response.data['childFrames'][0]['html']
IndexError: list index out of range
2016-06-03 11:54:32 [scrapy] INFO: Closing spider (finished)
2016-06-03 11:54:32 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1365,
'downloader/request_count': 4,
'downloader/request_method_count/GET': 3,
'downloader/request_method_count/POST': 1,
'downloader/response_bytes': 18165,
'downloader/response_count': 4,
'downloader/response_status_count/200': 2,
'downloader/response_status_count/404': 2,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 6, 3, 18, 54, 32, 163122),
'log_count/DEBUG': 5,
'log_count/ERROR': 1,
'log_count/INFO': 7,
'request_depth_max': 1,
'response_received_count': 4,
'scheduler/dequeued': 3,
'scheduler/dequeued/memory': 3,
'scheduler/enqueued': 3,
'scheduler/enqueued/memory': 3,
'spider_exceptions/IndexError': 1,
'splash/render.json/request_count': 1,
'splash/render.json/response_count/200': 1,
'start_time': datetime.datetime(2016, 6, 3, 18, 54, 26, 822252)}
2016-06-03 11:54:32 [scrapy] INFO: Spider closed (finished)
root@ubuntu:/opt/scrapy/ui5/ui5/spiders#
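The IndexError above means render.json came back with an empty childFrames list. A more defensive parse_page for debugging (a sketch, not the eventual fix; Spider.logger is standard Scrapy):

def parse_page(self, response):
    frames = response.data.get('childFrames') or []
    if not frames:
        self.logger.warning('no childFrames for %s', response.url)
        return
    sel = scrapy.Selector(text=frames[0]['html'])
    for div in sel.css('#content .sectionItem'):
        name = div.css('a::text').extract_first()
        desc = div.css('.description::text').extract_first() or ''
        print(': '.join([name, desc]))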
The Splash log for the same run:

2016-06-03 18:42:27+0000 [-] Log opened.
... (same Splash startup banner as before) ...
2016-06-03 18:42:33.741040 [-] "172.17.0.1" - - [03/Jun/2016:18:42:33 +0000] "GET /robots.txt HTTP/1.1" 404 153 "-" "Scrapy/1.1.0 (+http://scrapy.org)"
2016-06-03 18:42:38.960163 [events] {"timestamp": 1464979358, "rendertime": 5.214949131011963, "_id": 140035510107888, "fds": 19, "active": 0, "client_ip": "172.17.0.1", "maxrss": 81264, "qsize": 0, "user-agent": "Scrapy/1.1.0 (+http://scrapy.org)", "load": [0.01, 0.04, 0.05], "status_code": 200, "path": "/render.json", "args": {"iframes": true, "url": "https://sapui5.hana.ondemand.com/sdk/#docs/api/symbols/sap.html", "html": true, "headers": {"User-Agent": "Scrapy/1.1.0 (+http://scrapy.org)", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Referer": "https://sapui5.hana.ondemand.com/", "Accept-Language": "en", "Accept-Encoding": "gzip,deflate"}, "wait": 5.0, "uid": 140035510107888}, "method": "POST"}
2016-06-03 18:42:38.960617 [-] "172.17.0.1" - - [03/Jun/2016:18:42:38 +0000] "POST /render.json HTTP/1.1" 200 13662 "-" "Scrapy/1.1.0 (+http://scrapy.org)"
Try a real USER_AGENT setting (in settings.py):
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
After restarting Splash and re-running with the new user agent:

2016-06-03 20:37:19.777710 [-] Splash version: 2.1
... (same startup banner as before) ...
2016-06-03 20:37:24.992726 [-] "172.17.0.1" - - [03/Jun/2016:20:37:24 +0000] "GET /robots.txt HTTP/1.1" 404 153 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
2016-06-03 20:37:35.964753 [events] {"timestamp": 1464986255, "_id": 139808112914160, "active": 0, "args": {"iframes": true, "headers": {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36", "Referer": "https://sapui5.hana.ondemand.com/", "Accept-Language": "en", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Encoding": "gzip,deflate"}, "uid": 139808112914160, "wait": 10.0, "html": true, "url": "https://sapui5.hana.ondemand.com/sdk/#docs/api/symbols/sap.html"}, "maxrss": 79608, "fds": 19, "client_ip": "172.17.0.1", "path": "/render.json", "status_code": 200, "rendertime": 10.95194411277771, "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36", "method": "POST", "qsize": 0, "load": [0.09, 0.08, 0.06]}
2016-06-03 20:37:35.967636 [-] "172.17.0.1" - - [03/Jun/2016:20:37:35 +0000] "POST /render.json HTTP/1.1" 200 13662 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
Add step-by-step checks to parse_page to see where the childFrames access breaks:

def parse_page(self, response):
    if 'response' in locals():  # always true: 'response' is a parameter
        print 'response is defined'
    else:
        print 'Ooops: response is not defined'
        return
    print response
    if not hasattr(response, 'data'):
        # NB: the original check, "'response.data' in locals()", can never be
        # true (locals() only holds the name 'response'), which is why the run
        # below prints "Ooops" even though the request went through Splash.
        print 'Ooops: response.data is not defined'
        return
    print response.data
    print 'Len response.data: %d' % len(response.data)
    if 'childFrames' in response.data:
        print 'There is a childFrames entry'
    else:
        print 'Ooops: no childFrames'
        return
    if len(response.data['childFrames']) > 0:
        print 'There is childFrame 0'
    else:
        print 'Ooops: no childFrame 0'
        return
    print 'Len first child: %d' % len(response.data['childFrames'][0])
    print 'Len html: %d' % len(response.data['childFrames'][0]['html'])
    iframe_html = response.data['childFrames'][0]['html']
2016-06-03 13:37:24 [scrapy] DEBUG: Crawled (200) <GET https://sapui5.hana.ondemand.com/> (referer: None)
2016-06-03 13:37:24 [scrapy] DEBUG: Crawled (404) <GET http://localhost:8050/robots.txt> (referer: None)
2016-06-03 13:37:36 [scrapy] DEBUG: Crawled (200) <GET https://sapui5.hana.ondemand.com/sdk/#docs/api/symbols/sap.html via http://localhost:8050/render.json> (referer: None)
response is defined
<200 https://sapui5.hana.ondemand.com/sdk/#docs/api/symbols/sap.html>
Ooops: response.data is not defined
2016-06-03 13:37:36 [scrapy] INFO: Closing spider (finished)
Rolando,
2016-06-03 19:43:37 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-06-03 19:43:37 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2016-06-03 19:43:37 [scrapy] DEBUG: Crawled (404) <GET https://sapui5.hana.ondemand.com/robots.txt> (referer: None)
2016-06-03 19:43:37 [scrapy] DEBUG: Crawled (200) <GET https://sapui5.hana.ondemand.com/> (referer: None)
2016-06-03 19:43:37 [scrapy] DEBUG: Crawled (404) <GET http://localhost:8050/robots.txt> (referer: None)
2016-06-03 19:43:42 [scrapy] DEBUG: Crawled (200) <GET https://sapui5.hana.ondemand.com/sdk/docs/api/symbols/sap.html via http://localhost:8050/render.html> (referer: None)
2016-06-04 02:43:42.574895 [pool] [140619310439728] SLOT 10 done with <splash.qtrender.HtmlRender object at 0x7fe4341c00b8>
2016-06-04 02:43:42.576237 [events] {"active": 0, "path": "/render.html", "rendertime": 5.003755807876587, "maxrss": 94368, "client_ip": "172.17.0.1", "qsize": 0, "method": "POST", "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36", "timestamp": 1465008222, "load": [0.09, 0.05, 0.05], "status_code": 200, "fds": 19, "_id": 140619310439728, "args": {"height": 768, "headers": {"Accept-Encoding": "gzip,deflate", "Referer": "https://sapui5.hana.ondemand.com/", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en"}, "uid": 140619310439728, "png": 1, "iframes": 1, "wait": 5.0, "url": "https://sapui5.hana.ondemand.com/sdk/docs/api/symbols/sap.html", "http_method": "GET", "timeout": 10, "script": 1, "width": 1024, "html": 1, "console": 1}}
2016-06-04 02:43:42.576691 [-] "172.17.0.1" - - [04/Jun/2016:02:43:41 +0000] "POST /render.html HTTP/1.1" 200 1830 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
2016-06-04 02:43:42.577109 [pool] SLOT 10 is available
2016-06-04 02:45:06.550405 [pool] [140619313333752] SLOT 11 done with <splash.qtrender.HtmlRender object at 0x7fe47c038390>
2016-06-04 02:45:06.551410 [events] {"active": 0, "path": "/render.html", "rendertime": 0.7969868183135986, "maxrss": 94368, "client_ip": "172.17.0.1", "qsize": 0, "method": "GET", "user-agent": "curl/7.47.0", "timestamp": 1465008306, "load": [0.23, 0.11, 0.07], "status_code": 200, "fds": 19, "_id": 140619313333752, "args": {"height": "768", "console": "1", "iframe": "1", "uid": 140619313333752, "png": "1", "width": "1024", "wait": "0.5", "url": "https://sapui5.hana.ondemand.com/sdk/docs/api/symbols/sap.html", "timeout": "10", "script": "1", "html": "1"}}
2016-06-04 02:45:06.552238 [-] "172.17.0.1" - - [04/Jun/2016:02:45:05 +0000] "GET /render.html?url=https://sapui5.hana.ondemand.com/sdk/docs/api/symbols/sap.html&iframe=1&html=1&png=1&width=1024&height=768&script=1&console=1&timeout=10&wait=0.5 HTTP/1.1" 200 5562 "-" "curl/7.47.0"
2016-06-04 02:45:06.552681 [pool] SLOT 11 is available
def parse(self, response):
    #url = 'https://sapui5.hana.ondemand.com/sdk/#docs/api/symbols/sap.html'
    url = 'https://sapui5.hana.ondemand.com/sdk/docs/api/symbols/sap.html'
    yield SplashRequest(url, self.parse_page,
                        args={
                            'http_method': 'GET',
                            'timeout': 10,
                            'wait': 5.,
                            'iframes': 1,
                            'html': 1,
                            'png': 1,
                            'script': 1,
                            'console': 1,
                            'width': 1024,
                            'height': 768,
                        },
                        endpoint='render.html')
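Note the URL change: sdk/docs/api/symbols/sap.html is the iframe's own document, so there is no frame left to unwrap. And with endpoint='render.html' the response body is the rendered HTML itself (there is no response.data), so a matching parse_page can select directly. A sketch, with the selectors taken from the markup pasted at the top:

def parse_page(self, response):
    # render.html returns the rendered page as a normal HTML response
    for div in response.css('.sectionItem'):
        name = div.css('a::text').extract_first()
        desc = div.css('.description::text').extract_first() or ''
        print('%s: %s' % (name, desc))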