I am parsing a website. When I run the code on my PC, I get clean HTML. But when I run it on Google App Engine, I get wrongly encoded text like this:
o�Ƭ"3��&�C�]���lu}�j ���?��l�.�3 Y�e?�bG���c�����No�ј�}�����e�(�NK��S$T�8]I�G֤�TZְ
The encoding is ISO-8859-1. Python can't parse the page on Google App Engine. How can I fix it?
Here is the video: https://www.youtube.com/watch?v=MnIUk5QkHZU
Here is the parser:
# -*- coding: utf-8 -*-
import requests
import lxml.html
class Rutor:
    """Search rutor.org for torrents matching a title, year and quality.

    Call ``parse_it()`` to run the search.  Matches are collected in
    ``self.result`` as ``{index: {'title': ..., 'torrent_file': ...,
    'magnet': ...}}`` and ``self.count`` holds the number of stored entries.
    """

    def __init__(self, title, year='', qu=''):
        """Store the search terms and initialise empty result state.

        Args:
            title: Title to search for.
            year: Optional release year appended to the query.
            qu: Optional quality tag (e.g. ``'1080p'``) appended to the query.
        """
        self.title = title
        self.year = year
        self.qu = qu
        # NOTE(review): main_domain ends with '/' and search_params starts
        # with '/', so the built URL contains '//search/...'.  Left as-is
        # because the question reports that this URL works; confirm before
        # normalising.
        self.main_domain = 'http://www.rutor.org/'
        self.search_params = '/search/0/1/100/0/'  # only New movies
        self.search_text = ""
        self.count = 0
        self.result = {}

    def construct_search_text(self):
        """Join the non-empty search terms with spaces.

        The joined text is also cached on ``self.search_text``.

        Returns:
            The query text, e.g. ``'Avengers: Age of Ultron 2015 1080p'``.
        """
        terms = [term for term in (self.title, self.year, self.qu) if term]
        self.search_text = " ".join(terms)
        return self.search_text

    def construct_search_url(self):
        """Build the full search URL, print it, and return it."""
        search_link = "".join(
            (self.main_domain, self.search_params, self.construct_search_text())
        )
        print(search_link)
        return search_link

    def get_page_sourse(self):
        """Download the search page and return its body as raw bytes.

        Returns:
            The undecoded response body (``Response.content``).
        """
        r = requests.get(self.construct_search_url())
        # r.encoding can be None, so it must not be concatenated directly.
        print("encoding is: " + str(r.encoding))
        # Hand back the raw bytes instead of decoding with the charset that
        # requests guessed from the headers and encoding again: the round
        # trip fails when r.encoding is None and can mangle the body when the
        # guess is wrong.  lxml detects the charset from the bytes itself.
        return r.content

    def parse_it(self):
        """Fetch the search page and fill ``self.result`` with torrent links.

        Every anchor under ``div#index`` whose href starts with ``/torrent``
        is considered.  Trailers are skipped, as are anchors that do not
        have the two preceding sibling elements the code reads the magnet
        and .torrent links from.
        """
        all_torrent_links_xpath = (
            "//div[@id='index']//a[starts-with(@href, '/torrent')]"
        )
        # Fetch once and reuse: the same bytes are parsed and printed, so
        # only a single HTTP request is made.
        source = self.get_page_sourse()
        page = lxml.html.document_fromstring(source)
        print(source)  # page source, printed for debugging

        for link in page.xpath(all_torrent_links_xpath):
            print(link)
            # text_content() never returns None, unlike link.text, which is
            # None when the anchor starts with a child element.
            title = link.text_content()
            if u'трейлер' in title.lower():  # we don't need trailers
                continue

            magnet_link = link.getprevious()
            torrent_link = (
                magnet_link.getprevious() if magnet_link is not None else None
            )
            if torrent_link is None:
                # The expected sibling anchors are missing; nothing to store.
                continue

            # title[:] turns lxml's string-result subclass into a plain string.
            self.result[self.count] = {
                'title': title[:],
                'torrent_file': torrent_link.attrib['href'],
                'magnet': magnet_link.attrib['href'],
            }
            self.count += 1
if __name__ == '__main__':
    # Manual run: search for one film, then show everything that was collected.
    m = Rutor(
        'Avengers: Age of Ultron',
        '2015',
        '1080p',
    )
    m.parse_it()
    print(m.result)
If I run it in Sublime Text, I get the correct HTML page source, and m.result is not empty. However, if I run this code on Google App Engine with Flask:
import Rutor
...
@app.route('/test')
def test():
    """Run one Rutor search, pretty-print its results, and return 'test'."""
    # NOTE(review): the snippet shows `import Rutor`, which binds the module;
    # this call only works if `Rutor` names the class here -- confirm the
    # elided imports.
    m = Rutor(
        'Avengers: Age of Ultron',
        '2015',
        '1080p',
    )
    m.parse_it()
    pprint(m.result)
    return 'test'
I get wrongly encoded text in my console, and m.result is empty.
I have been unable to fix this for more than two weeks; please help me.