def get_poster_details(elm):
    """Pull the poster's login, profile URL, and post timestamp out of a
    post-signature element.

    `user_login`/`user_url` come back as None when the signature has no
    user link (e.g. a deleted account); `date` is the title attribute of
    the action-time span.
    """
    link = elm.find(class_='user-details').a
    timestamp = elm.find(class_='user-action-time').span['title']
    return {
        'user_login': link.get_text() if link else None,
        'user_url': link['href'] if link else None,
        'date': timestamp,
    }
def get_comment_activity(elm):
    """Pull the commenter's login, profile URL, and comment timestamp out
    of a comment-body element.

    Unlike get_poster_details, the user link is assumed to be present.
    """
    author = elm.find(class_='comment-user')
    when = elm.find(class_='comment-date').span['title']
    return {
        'user_login': author.get_text(),
        'user_url': author['href'],
        'date': when,
    }
def get_question_activity(url):
    """Fetch a question page and summarize who posted, answered, and
    commented, and when.

    Returns a dict of the shape:
        {'question': {'asked_by': {...}, 'comments': [...]},
         'answers': [{'answered_by': {...}, 'comments': [...]}, ...]}
    where the inner dicts come from get_poster_details / get_comment_activity.

    Fix: the original mixed find() calls with CSS selectors; this version
    uses select() consistently throughout.
    """
    html = get_page(url)
    # NOTE(review): no explicit parser is passed, so bs4 guesses one per
    # environment (and warns). Consider BeautifulSoup(html, 'html.parser')
    # for deterministic parsing — confirm no caller relies on lxml quirks.
    soup = BeautifulSoup(html)

    def _post_activity(container, poster_selector, poster_key, index):
        # Shared shape for the question and for each answer: the poster's
        # signature element plus every comment under the post.
        poster = container.select(poster_selector)[index]
        return {
            poster_key: get_poster_details(poster),
            'comments': [
                get_comment_activity(elm)
                for elm in container.select('.comments .comment-body')
            ],
        }

    question = soup.select('.question')[0]
    return {
        'question': _post_activity(
            question, '.postcell .post-signature.owner', 'asked_by', 0),
        'answers': [
            # [-1]: the last .user-info block is the answer's owner
            # (earlier ones are editors) — matches the original code.
            _post_activity(answer, '.answercell .user-info', 'answered_by', -1)
            for answer in soup.select('#answers .answer')
        ],
    }
Joe, The short answer is no, I don't think this is idiomatic. In particular I would not mix CSS selectors and calls to find(). I'd use one or the other. However, it doesn't matter. Beautiful Soup is resolutely anti-idiomatic. It's intended to mesh with Python idioms (list comprehensions, method calls) better than domain-specific languages like CSS selectors and XPath do. But the attitude of Beautiful Soup is "no questions asked." Whatever works for you is fine. The important thing is that your Beautiful Soup code is consistent with the rest of your code. That said, if CSS selectors are your thing, you can get better performance by using lxml with its CSSSelector class. When I need speed, I personally use lxml and XPath expressions. But if you just need to sketch out some code until it can pull some data out of some webpages, Beautiful Soup is fine. Leonard