I was able to figure it out myself.
The key is to go for mechanize!
Mechanize manages the cookies itself, behaves just like a browser, and gives you a lot of power.
Beautiful Soup and Mechanize make a superb and very very powerful combination.
If you want to have a look at my code, let me know and I shall send it to you.
Thanks,
Sehaj
On Wed, May 23, 2012 at 7:14 PM, Akira Kir
<4k1r4...@gmail.com> wrote:
I'm currently doing almost the same thing, so I think I can give you some direction on how to do it.
My current code looks like this:
#!/usr/bin/env python2
#-*- coding: utf-8 -*-
"""Log in to a web site, download its activity page and dump a table to text.

Python 2 script (it relies on ``urllib2`` and ``cookielib``).  When run as
a program it takes the user id and the password as the first and second
command-line arguments.
"""
__author__ = 'Alex Kir'
__version__ = '0.6'
import urllib
import urllib2
import cookielib
import sys
from bs4 import BeautifulSoup
# Base address of the target site (placeholder value).  login() and fetch()
# append page names such as 'login.asp' and 'Activity.aspx' to it.
url = 'https://urlhere.com'
def login(uid, password, auth_field=None):
    """Post the credentials to the login page and return the response.

    Uses the module-level ``opener`` and ``url`` globals.

    Args:
        uid: Value sent as the ``uid`` form field.
        password: Value sent as the ``password`` form field.
        auth_field: Optional and currently unused; accepted so callers that
            pass a third argument keep working.

    Returns:
        The response object returned by ``opener.open`` for the login POST.
    """
    opener.addheaders = [
        ('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; '
                        'Windows NT 5.2; .NET CLR 1.1.4322)'))]
    login_data = urllib.urlencode({
        'uid': uid,
        'password': password,
    })
    # Hit the base address before logging in (presumably so the opener can
    # pick up session cookies -- confirm against the target site).  The body
    # is not needed, so release the connection straight away.
    opener.open(url).close()
    # Join with exactly one slash so the request is valid whether or not
    # ``url`` was configured with a trailing '/'.
    login_url = url.rstrip('/') + '/login.asp'
    # Passing a data argument makes this request a POST.
    return opener.open(login_url, login_data)
def fetch(object):
    """Download the activity page and return its body.

    Uses the module-level ``opener`` and ``url`` globals.

    Args:
        object: Unused.  main() passes the value returned by login() here;
            the parameter (and its builtin-shadowing name) is kept only so
            existing callers keep working.

    Returns:
        The content of ``Activity.aspx`` exactly as read from the response.
    """
    # Join with exactly one slash so the request is valid whether or not
    # ``url`` was configured with a trailing '/'.
    resp = opener.open(url.rstrip('/') + '/Activity.aspx')
    try:
        return resp.read()
    finally:
        # Always release the connection, even if reading fails.
        resp.close()
def parse(data, out_path='Leumi.out'):
    """Extract the activity table from an HTML page and write it to a file.

    Every ``<tr>`` of the table with id ``ctlActivityTable`` becomes one
    output line.  Each ``<td>`` whose CSS classes do not include
    ``HiddenColumn`` contributes ``'#'`` followed by its stripped text.
    Rows without any visible ``<td>`` produce an empty line.

    Args:
        data: HTML markup of the activity page.
        out_path: File to write (UTF-8).  Defaults to ``'Leumi.out'``.

    Raises:
        ValueError: If the page contains no ``ctlActivityTable`` table.
    """
    import io  # local import: only this function needs it

    soup = BeautifulSoup(data, from_encoding='utf-8')
    table = soup.find('table', {'id': 'ctlActivityTable'})
    if table is None:
        # Fail with a clear message instead of an AttributeError on None
        # (e.g. when the site returned a login or error page).
        raise ValueError("table 'ctlActivityTable' not found in page")

    # Collect everything before touching the file so a parsing error does
    # not leave a half-written output behind.
    lines = []
    for row in table.find_all('tr'):
        cells = [
            td.get_text().strip()
            for td in row.find_all('td')
            # A <td> without a class attribute yields None; treat that as
            # "no classes" and test membership in the class list rather
            # than a substring match against the literal.
            if 'HiddenColumn' not in (td.get('class') or [])
        ]
        lines.append(u''.join(u'#' + text for text in cells) + u'\n')

    # Encode once, at the file boundary.
    with io.open(out_path, 'w', encoding='utf-8') as f:
        f.writelines(lines)
def main(uid, password, auth_field=None):
    """Log in, download the activity page and write the parsed table out.

    Args:
        uid: User id forwarded to login().
        password: Password forwarded to login().
        auth_field: Optional third value forwarded to login(); defaults to
            ``None`` so two-argument callers keep working.
    """
    # login() takes three positional parameters, so always pass all three.
    response = login(uid, password, auth_field)
    page = fetch(response)
    parse(page)
if __name__ == '__main__':
    # Expect the user id and the password as the two command-line arguments.
    # NOTE(review): a password given on the command line is visible in the
    # process list and shell history; consider getpass or an env variable.
    if len(sys.argv) < 3:
        sys.exit('usage: %s UID PASSWORD' % sys.argv[0])
    uid = sys.argv[1]
    password = sys.argv[2]
    # ``opener`` is deliberately a module-level name: login() and fetch()
    # read it as a global.  The cookie processor stores cookies in ``cj``
    # so they are sent again on later requests made through this opener.
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(
        urllib2.HTTPRedirectHandler(),
        urllib2.HTTPHandler(debuglevel=0),
        urllib2.HTTPSHandler(debuglevel=0),
        urllib2.HTTPCookieProcessor(cj),
    )
    # The original call passed an undefined ``auth_field`` name, which
    # raised NameError; main() only needs the two credentials.
    main(uid, password)
Hope that will help :)