--
You received this message because you are subscribed to the Google Groups "pyspider-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email to pyspider-user...@googlegroups.com.
To post to this group, send email to pyspide...@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/pyspider-users/03042922-e5da-4b41-889e-2f5d1e489044%40googlegroups.com.
For more options, visit https://groups.google.com/d/optout.
'''
pyspider结果保存到数据库简单样例。
使用方法:
1,把本文件放到pyspider/pyspider/database/mysql/目录下命名为mysqldb.py。
2,修改本文件的数据库配置参数及建立相应的表和库。
3,在脚本文件里使用from pyspider.database.mysql.mysqldb import SQL引用本代码.
4,重写on_result方法,实例化sql并调用replace(replace方法参数第一个是表名,第二个是结果。)。简单例子如下:
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-01-26 13:12:04
# Project: jianke
from pyspider.libs.base_handler import *
from pyspider.database.mysql.mysqldb import SQL
class Handler(BaseHandler):
crawl_config = {
}
@every(minutes=24 * 60)
def on_start(self):
self.crawl('http://www.test.com/', callback=self.index_page)
@config(age=10 * 24 * 60 * 60)
def index_page(self, response):
for each in response.doc('p.pic a[href^="http"]').items():
print each.attr.href
@config(priority=2)
def detail_page(self, response):
return {
"url": response.url,
"title": response.doc('HTML>BODY#presc>DIV.main>DIV.prices_box.wid980.clearfix>DIV.detail_box>DL.assort.tongyong>DD>A').text(),
}
def on_result(self, result):
#print result
if not result or not result['title']:
return
sql = SQL()
sql.replace('info',**result)
'''
from six import itervalues
import mysql.connector
from datetime import date, datetime, timedelta
class SQL:
    """Minimal MySQL helper that stores result rows with ``REPLACE INTO``.

    The settings and the open connection are class attributes, so every
    instance shares the same connection object (``SQL.connection``).
    Edit the attributes below to match your database before use.
    """

    username = 'pyspider'  # database user name
    password = 'pyspider'  # database password
    database = 'result'    # schema to select; None connects without one
    host = 'localhost'     # database server address
    connection = ''        # shared connection; '' means "not connected yet"
    placeholder = '%s'     # parameter marker used in generated statements
    # The old ``connect = True`` flag was dropped: the ``connect`` method
    # defined below replaced it in the class namespace, so it never had
    # any effect.

    def __init__(self):
        """Open the shared connection unless a live one already exists."""
        # ``self.connect`` is normally the bound method (always truthy); the
        # check is kept so a subclass or instance that overrides it with a
        # falsy value still skips the automatic connect, as before.
        if self.connect and not self._has_live_connection():
            SQL.connect(self)

    def _has_live_connection(self):
        """Return True when ``SQL.connection`` is set and reports itself connected."""
        conn = SQL.connection
        if conn == '':
            return False
        # Objects without ``is_connected`` are treated as not reusable, which
        # falls back to the original "always reconnect" behaviour.
        is_connected = getattr(conn, 'is_connected', None)
        return bool(is_connected and is_connected())

    def escape(self, string):
        """Return *string* quoted as a MySQL identifier.

        Backticks inside the name are doubled so the quoting cannot be
        terminated early by the identifier itself.
        """
        name = '%s' % string
        return '`' + name.replace('`', '``') + '`'

    def connect(self):
        """Connect using the class-level settings and store the connection.

        Returns:
            True on success (``SQL.connection`` is replaced), False when the
            driver raises ``mysql.connector.Error``; a short diagnostic is
            printed in the failure case.
        """
        config = {
            'user': SQL.username,
            'password': SQL.password,
            'host': SQL.host,
        }
        if SQL.database is not None:
            config['database'] = SQL.database
        try:
            cnx = mysql.connector.connect(**config)
        except mysql.connector.Error as err:
            # Imported here because the module never imported ``errorcode``;
            # without it this handler itself raised NameError.
            from mysql.connector import errorcode
            if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
                print("The credentials you provided are not correct.")
            elif err.errno == errorcode.ER_BAD_DB_ERROR:
                print("The database you provided does not exist.")
            else:
                print("Something went wrong: %s" % (err,))
            return False
        SQL.connection = cnx
        return True

    def replace(self, tablename=None, **values):
        """Write one row into *tablename* with ``REPLACE INTO``.

        Args:
            tablename: name of the target table (quoted with ``escape``).
            **values: column name -> value pairs; column names are quoted,
                values are passed to the driver as bound parameters.

        Returns:
            True when the statement was executed and committed, False when
            there is no connection or the driver raised
            ``mysql.connector.Error``.
        """
        if SQL.connection == '':
            print("Please connect first")
            return False

        table = self.escape(tablename)
        if values:
            columns = ", ".join(self.escape(k) for k in values)
            markers = ", ".join([self.placeholder] * len(values))
            sql_query = "REPLACE INTO %s (%s) VALUES (%s)" % (table, columns, markers)
            # Same dict, not modified in between, so the value order matches
            # the column order generated above.
            params = list(values.values())
        else:
            # MySQL has no ``DEFAULT VALUES`` clause; empty column and value
            # lists are its spelling for "all defaults".
            sql_query = "REPLACE INTO %s () VALUES ()" % table
            params = None

        cur = SQL.connection.cursor()
        try:
            if params is None:
                cur.execute(sql_query)
            else:
                cur.execute(sql_query, params)
            SQL.connection.commit()
            return True
        except mysql.connector.Error as err:
            print ("An error occured: {}".format(err))
            return False
        finally:
            # The cursor was previously left open on every call.
            cur.close()
[D 150202 14:01:32 scheduler:252] ignore newtask chinaunix_blog:688202e08759ba590a7c0c2446a7be99 http://blog.chinaunix.net/uid/30098782.html
[D 150202 14:01:32 scheduler:252] ignore newtask chinaunix_blog:f757e8e6e02d0dcc44c1ddb7b3c2a86b http://blog.chinaunix.net/uid/30097826.html
[D 150202 14:01:32 scheduler:533] ignore newtask chinaunix_blog:395cc91920f3284d3b5310799a2a9f2a http://blog.chinaunix.net/uid/30057524.html
[D 150202 14:01:32 scheduler:533] ignore newtask chinaunix_blog:1ed0c83ecc15e23cc7e3e8b1da2ec62d http://blog.chinaunix.net/uid/30088444.html
[D 150202 14:01:32 scheduler:533] ignore newtask chinaunix_blog:db108cade096c2a09616f5da100401ea http://blog.chinaunix.net/uid/30107309.html
[D 150202 14:01:32 scheduler:533] ignore newtask chinaunix_blog:d643508a4477b1004a0cbfa8f69912cf http://blog.chinaunix.net/uid/1827018.html
[D 150202 14:01:32 scheduler:533] ignore newtask chinaunix_blog:3cd56e2c05358b3244a729ec5616f5ff http://blog.chinaunix.net/uid-7374279-id-4813735.html
[D 150202 14:01:32 scheduler:533] ignore newtask chinaunix_blog:95673ec2a7cf60be5e14abd2064423ee http://blog.chinaunix.net/uid-301743-id-4813354.html
[D 150202 14:01:32 scheduler:533] ignore newtask chinaunix_blog:6433631ba71c45d92d5a624dedc0982a http://blog.chinaunix.net/uid-13328506-id-4809491.html
[D 150202 14:01:32 scheduler:533] ignore newtask chinaunix_blog:22d9f9a2cdfdaf3d42ee94061c7ea28c http://blog.chinaunix.net/uid-14528823-id-4808877.html
[D 150202 14:01:32 scheduler:533] ignore newtask chinaunix_blog:5613a37cc5bae7aae01f33e9558fbc88 http://blog.chinaunix.net/uid-509190-id-4807958.html
[D 150202 14:01:32 scheduler:533] ignore newtask chinaunix_blog:e6119897ffaed66a440d5766dc22645f http://blog.chinaunix.net/uid-24780853-id-4425130.html
[D 150202 14:01:32 scheduler:533] ignore newtask chinaunix_blog:84ca467684da257eff942c3afcb79658 http://blog.chinaunix.net/uid-24780853-id-4425129.html
[D 150202 14:01:32 scheduler:533] ignore newtask chinaunix_blog:0bc550bfeef1ff816564f527d8dbe056 http://blog.chinaunix.net/uid-24780853-id-4091413.html
[D 150202 14:01:32 scheduler:533] ignore newtask chinaunix_blog:36054be7a85b718716b9e2123a8cbead http://blog.chinaunix.net/uid-24780853-id-4043233.html
[D 150202 14:01:32 scheduler:533] ignore newtask chinaunix_blog:d85f3b2d2514962578b0ebfcf85fa420 http://blog.chinaunix.net/uid-24780853-id-4043226.html
[D 150202 14:01:32 scheduler:533] ignore newtask chinaunix_blog:0d70720a030f7f8cd1ed1af844f9c330 http://blog.chinaunix.net/uid/0.html
[I 150202 14:01:36 scheduler:632] select chinaunix_blog:9a2df5be4bc757426e8d86600fb98fd2 http://blog.chinaunix.net/uid/29679056.html
[I 150202 14:01:36 tornado_fetcher:232] [200] http://blog.chinaunix.net/uid/29679056.html 0.10s
[I 150202 14:01:36 processor:153] process chinaunix_blog:9a2df5be4bc757426e8d86600fb98fd2 http://blog.chinaunix.net/uid/29679056.html -> [200] len:45943 -> result:None fol:10 msg:0 err:None
[I 150202 14:01:36 _internal:87] 106.186.112.20 - - [02/Feb/2015 14:01:36] "GET /counter?time=1d&type=sum HTTP/1.1" 200 -
[I 150202 14:01:36 _internal:87] 106.186.112.20 - - [02/Feb/2015 14:01:36] "GET /counter?time=1h&type=sum HTTP/1.1" 200 -
[I 150202 14:01:36 scheduler:581] task done chinaunix_blog:9a2df5be4bc757426e8d86600fb98fd2 http://blog.chinaunix.net/uid/29679056.html
[D 150202 14:01:36 scheduler:533] ignore newtask chinaunix_blog:9a2df5be4bc757426e8d86600fb98fd2 http://blog.chinaunix.net/uid/29679056.html
[D 150202 14:01:36 scheduler:533] ignore newtask chinaunix_blog:a8730a147bd5cc023d40aba5d18768eb http://blog.chinaunix.net/uid-24789255-id-198226.html
[D 150202 14:01:36 scheduler:533] ignore newtask chinaunix_blog:e3a73eaff72c29ead1a2e0e2332afae9 http://blog.chinaunix.net/uid-24789255-id-4288187.html
[D 150202 14:01:36 scheduler:533] ignore newtask chinaunix_blog:7d37148a6a64c5f881e26a773792c836 http://blog.chinaunix.net/uid-24789255-id-4289076.html
[D 150202 14:01:36 scheduler:533] ignore newtask chinaunix_blog:3cd56e2c05358b3244a729ec5616f5ff http://blog.chinaunix.net/uid-7374279-id-4813735.html
[D 150202 14:01:36 scheduler:533] ignore newtask chinaunix_blog:95673ec2a7cf60be5e14abd2064423ee http://blog.chinaunix.net/uid-301743-id-4813354.html
[D 150202 14:01:36 scheduler:533] ignore newtask chinaunix_blog:6433631ba71c45d92d5a624dedc0982a http://blog.chinaunix.net/uid-13328506-id-4809491.html
[D 150202 14:01:36 scheduler:533] ignore newtask chinaunix_blog:22d9f9a2cdfdaf3d42ee94061c7ea28c http://blog.chinaunix.net/uid-14528823-id-4808877.html
[D 150202 14:01:36 scheduler:533] ignore newtask chinaunix_blog:5613a37cc5bae7aae01f33e9558fbc88 http://blog.chinaunix.net/uid-509190-id-4807958.html
[D 150202 14:01:36 scheduler:533] ignore newtask chinaunix_blog:072494f77a2fa2375917cc7c24986991 http://blog.chinaunix.net/uid-29679056-id-4274466.html
[I 150202 14:01:37 scheduler:632] select chinaunix_blog:6d19290d4b2faed9d77dd46313e47b2c http://blog.chinaunix.net/uid/301541.html
[I 150202 14:01:37 tornado_fetcher:232] [200] http://blog.chinaunix.net/uid/301541.html 0.11s
[I 150202 14:01:37 processor:153] process chinaunix_blog:6d19290d4b2faed9d77dd46313e47b2c http://blog.chinaunix.net/uid/301541.html -> [200] len:44969 -> result:None fol:10 msg:0 err:None
[I 150202 14:01:37 scheduler:581] task done chinaunix_blog:6d19290d4b2faed9d77dd46313e47b2c http://blog.chinaunix.net/uid/301541.html
[D 150202 14:01:37 scheduler:533] ignore newtask chinaunix_blog:6d19290d4b2faed9d77dd46313e47b2c http://blog.chinaunix.net/uid/301541.html
[D 150202 14:01:37 scheduler:533] ignore newtask chinaunix_blog:a8730a147bd5cc023d40aba5d18768eb http://blog.chinaunix.net/uid-24789255-id-198226.html
[D 150202 14:01:37 scheduler:533] ignore newtask chinaunix_blog:e3a73eaff72c29ead1a2e0e2332afae9 http://blog.chinaunix.net/uid-24789255-id-4288187.html
[D 150202 14:01:37 scheduler:533] ignore newtask chinaunix_blog:7d37148a6a64c5f881e26a773792c836 http://blog.chinaunix.net/uid-24789255-id-4289076.html
[D 150202 14:01:37 scheduler:533] ignore newtask chinaunix_blog:3cd56e2c05358b3244a729ec5616f5ff http://blog.chinaunix.net/uid-7374279-id-4813735.html
[D 150202 14:01:37 scheduler:533] ignore newtask chinaunix_blog:95673ec2a7cf60be5e14abd2064423ee http://blog.chinaunix.net/uid-301743-id-4813354.html
[D 150202 14:01:37 scheduler:533] ignore newtask chinaunix_blog:6433631ba71c45d92d5a624dedc0982a http://blog.chinaunix.net/uid-13328506-id-4809491.html
[D 150202 14:01:37 scheduler:533] ignore newtask chinaunix_blog:22d9f9a2cdfdaf3d42ee94061c7ea28c http://blog.chinaunix.net/uid-14528823-id-4808877.html
[D 150202 14:01:37 scheduler:533] ignore newtask chinaunix_blog:5613a37cc5bae7aae01f33e9558fbc88 http://blog.chinaunix.net/uid-509190-id-4807958.html
[D 150202 14:01:37 scheduler:533] ignore newtask chinaunix_blog:c6a4e98c05e1669f2df01a2dda8aed2c http://blog.chinaunix.net/uid-301541-id-2441918.html
详细提供
{
"webui": {
"port": 5001,
"username": "admin",
"password": "123456",
"need_auth": "true"
}
}
详细提供
--
You received this message because you are subscribed to the Google Groups "pyspider-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email to pyspider-user...@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/pyspider-users/ccf241a5-1c4b-40ad-88ab-a8e04624e78e%40googlegroups.com.
To unsubscribe from this group and stop receiving emails from it, send an email to pyspide...@googlegroups.com.