This post is slightly off-topic here — sorry about that.
One of the problems GAE users face is how to run periodic or CPU-intensive
tasks. Since GAE limits each request to 10 seconds, most users cannot run
tasks that take longer than that, and periodic jobs are not supported by
GAE. One suggested method is to call a GAE URL periodically from an
external service or machine.
Here is a workaround that provides cron-like scheduling without any external
service or machine. This library provides two functions, timer and loop.
timer arranges for a callback function to be called at periodic intervals;
each invocation of the callback happens in a separate HTTP request.
loop arranges for a callback function to be called once for every member of
a list; each callback runs in a separate HTTP request. There is no
limit on the number of elements in the list.
Using this library, I wrote a program that fetches a list of URLs every
hour. The complete code is below.
I think programs like crawlers or Google News-style sites can be
built with this library.
Warning: this makes heavy use of urlfetch, and the maximum number of
urlfetch calls per day is 160,000, so be careful not to reach that limit.
You will see many "deadline exceeded" errors; that is expected.
So here is the complete code. To start the program, point your browser
to http://&lt;app-id&gt;.appspot.com/start, and to stop the program, point
your browser to http://&lt;app-id&gt;.appspot.com/stop
#!/usr/bin/env python
import hashlib
import logging
import md5
import os
import time
from cStringIO import StringIO

from google.appengine.api import memcache
from google.appengine.api import urlfetch
from google.appengine.ext import webapp
from google.appengine.ext.webapp import template
from google.appengine.ext.webapp.util import run_wsgi_app
# library code
def mysleep(r):
time.sleep(3)
def geturl(url):
try:
urlfetch.fetch(url)
except :
pass
def geturlkey(url):
n=md5.md5()
n.update(url)
return n.hexdigest()
def timer(func, interval):
timerlist = memcache.get('timer')
if(None == timerlist):
timerlist = []
timerlist.append({'func':func, 'interval':interval})
memcache.set('timer-'+func, '1', interval)
memcache.set('timer', timerlist)
def loop(func, args):
looplist = memcache.get('loop')
if(None == looplist):
looplist = []
looplist.append({'func':func, 'args':args})
memcache.set('loop', looplist)
def handletimer(host, uindex):
timerlist = memcache.get('timer')
if(None == timerlist):
return False
current = None
for index in range(uindex, len(timerlist)):
if(None == memcache.get('timer-'+timerlist[index]['func'])):
current = timerlist[index]
break
if(current == None):
for index in range(0, uindex):
if(None == memcache.get('timer-'+timerlist[index]['func'])):
current = timerlist[index]
break
if(current is not None):
memcache.set('timer-'+current['func'], '1', current['interval'])
try:
eval(current['func']+'()')
except:
pass
if((index+1) == len(timerlist)):
index = -1
geturl(host+'/next?t=t&i='+str(index+1))
return True
else:
return False
def handleloop(host, uindex):
looplist = memcache.get('loop')
if(None == looplist):
return False
if(len(looplist) > 0):
arg = looplist[0]['args'].pop(0)
func = looplist[0]['func']
if(len(looplist[0]['args']) == 0):
looplist.pop(0)
if((len(looplist) > 0) and (len(looplist[0]['args']) > 0)):
memcache.set('loop', looplist)
else:
memcache.delete('loop')
try:
eval(func+'('+repr(arg)+')')
except:
pass
geturl(host+'/next?t=l&i='+str(uindex+1))
return True
else:
return False
class MainPage(webapp.RequestHandler):
def get(self):
self.response.out.write('hello world')
class StartPage(webapp.RequestHandler):
def get(self):
data = memcache.get('status')
if(data == 'running'):
self.response.out.write("fail")
return
memcache.set('status', 'running')
memcache.delete('timer')
memcache.delete('loop')
startfunction()
geturl("http://"+self.request.headers["HOST"]+'/task')
self.response.out.write("ok")
class StopPage(webapp.RequestHandler):
def get(self):
memcache.set('status', 'stop')
self.response.out.write("ok")
class TimerPage(webapp.RequestHandler):
def get(self):
self.response.out.write("ok")
if not ('running' == memcache.get('status')):
return
index = int(self.request.get('i', '0'))
if(False == handletimer("http://"+self.request.headers["HOST"],
index)):
retry = int(self.request.get('r', '0'))
mysleep(retry)
geturl("http://"+self.request.headers["HOST"]+'/task?r='+str(retry
+1))
class TaskPage(webapp.RequestHandler):
def get(self):
self.response.out.write("ok")
if not ('running' == memcache.get('status')):
return
if(False == handleloop("http://"+self.request.headers["HOST"], 0)):
if(False == handletimer("http://"+self.request.headers["HOST"],
0)):
retry = int(self.request.get('r', '0'))
mysleep(retry)
geturl("http://"+self.request.headers["HOST"]+'/sleep?r='+str(retry
+1))
class LoopPage(webapp.RequestHandler):
def get(self):
self.response.out.write("ok")
if not ('running' == memcache.get('status')):
return
index = int(self.request.get('i', '0'))
if(False == handleloop("http://"+self.request.headers["HOST"],
index)):
if(False == handletimer("http://"+self.request.headers["HOST"],
0)):
retry = int(self.request.get('r', '0'))
geturl("http://"+self.request.headers["HOST"]+'/task?r='+str(retry
+1))
class NextPage(webapp.RequestHandler):
def get(self):
self.response.out.write("ok")
if not ('running' == memcache.get('status')):
return
index = int(self.request.get('i', '0'))
urlt = self.request.get('t', 'l')
url = "loop"
if(urlt == 't'):
url = "timer"
geturl("http://"+self.request.headers["HOST"]+'/'+url+'?
i='+str(index))
class SleepPage(webapp.RequestHandler):
def get(self):
self.response.out.write("ok")
if not ('running' == memcache.get('status')):
return
retry = int(self.request.get('r', '0'))
mysleep(retry)
geturl("http://"+self.request.headers["HOST"]+'/task?r='+str(retry
+1))
application = webapp.WSGIApplication([('/', MainPage),
('/start', StartPage),
('/stop', StopPage),
('/task', TaskPage),
('/loop', LoopPage),
('/timer', TimerPage),
('/next', NextPage),
('/sleep', SleepPage)],
debug=True)
def main():
run_wsgi_app(application)
if __name__ == "__main__":
main()
#user's code
#list of urls to be fetched
urllist = ['
http://www.google.com/', '
http://www.cnn.com/', 'http://
news.google.com', '
http://techmeme.com', '
http://www.yahoo.com']
def getone(url):
try:
result = urlfetch.fetch(url)
if(result.status_code == 200):
memcache.set(geturlkey(url), '1', 60*60)
except :
pass
def getallurl():
global urllist
fetchlist = []
for url in urllist:
if (memcache.get(geturlkey(url)) is None):
fetchlist.append(url)
#this is equivalent to
#for url in fetchlist: getone(url)
loop('getone', fetchlist)
def startfunction():
#function getallurl will be called every 3*60 seconds
timer('getallurl', 3*60)