>>> s="""<html><body><p>text1</p><!--
OAS_RICH('x70');
// --><p>text2</p></body></html>"""
>>> from BeautifulSoup import BeautifulSoup as BS
>>> from BeautifulSoup import Comment
>>> bsoup=BS(s)
>>> comments = bsoup.findAll(text=lambda text:isinstance(text, Comment))
>>> [comment.extract() for comment in comments]
[None]
>>> body = bsoup.body(text=True)
>>> text = ''.join(body)
>>> print text
text1text2
All comments, including those ending with "//", were deleted. Maybe your
problem is caused by some invalid HTML.
Another possible way (though not the best one) is to delete the comments
with a regular expression (this needs "import re" first):
>>> s="""<html><body><p>text1</p><!--
OAS_RICH('x70');
// --><p>text2<!--other comment // --></p></body></html>"""
>>> s2 = re.compile('<!--.*?-->', re.DOTALL).sub('', s)
>>> print s2
<html><body><p>text1</p><p>text2</p></body></html>
Windows XP; python 2.5.1; BS 3.0.4
Best regards, Yuriy.
s='''<html><body><script type="text/javascript">
<!--
spFramebuster();
-->
</script> <script type="text/javascript">
<!--
OAS_RICH('x70');
-->
</script><br class="spBreakNoHeight" clear="all" /> <!-- ##SPONTAG:
LAYER## -->
</body>
</html> '''
>>> bsoup = BeautifulSoup.BeautifulSoup(s)
>>> comments = bsoup.findAll(text=lambda text:isinstance(text,
BeautifulSoup.Comment))
>>> [comment.extract() for comment in comments]
[None]
>>> body = bsoup.body(text=True)
>>> text = ''.join(body)
>>> print text
<!--
spFramebuster();
-->
<!--
OAS_RICH('x70');
-->
and without <script>:
s='''<html><body>
<!--
spFramebuster();
// -->
<!--
OAS_RICH('x70');
// -->
<br class="spBreakNoHeight" clear="all" /> <!-- ##SPONTAG: LAYER## -->
</body>
</html> '''
>>> bsoup = BeautifulSoup.BeautifulSoup(s)
>>> comments = bsoup.findAll(text=lambda text:isinstance(text,
BeautifulSoup.Comment))
>>> [comment.extract() for comment in comments]
[None, None, None]
>>> body = bsoup.body(text=True)
>>> text = ''.join(body)
>>> text
u'\n\n\n \n'
and with <script> but without "//":
s='''<html><body><script type="text/javascript">
<!--
spFramebuster();
-->
</script> <script type="text/javascript">
<!--
OAS_RICH('x70');
-->
</script><br class="spBreakNoHeight" clear="all" /> <!-- ##SPONTAG:
LAYER## -->
</body>
</html> '''
>>> bsoup = BeautifulSoup.BeautifulSoup(s)
>>> comments = bsoup.findAll(text=lambda text:isinstance(text,
BeautifulSoup.Comment))
>>> [comment.extract() for comment in comments]
[None]
>>> body = bsoup.body(text=True)
>>> text = ''.join(body)
>>> print text
<!--
spFramebuster();
-->
<!--
OAS_RICH('x70');
-->
So you have to remove all <script> tags together with their content, just
as you removed all the comments:
>>> s='''<html><body><script type="text/javascript">
<!--
spFramebuster();
// -->
</script> <script type="text/javascript">
<!--
OAS_RICH('x70');
// -->
</script><br class="spBreakNoHeight" clear="all" /> <!-- ##SPONTAG:
LAYER## -->
</body>
</html> '''
>>> bsoup = BeautifulSoup.BeautifulSoup(s)
>>> comments = bsoup.findAll(text=lambda text:isinstance(text,
BeautifulSoup.Comment))
>>> [comment.extract() for comment in comments]
[None]
>>> c=bsoup.findAll('script')
>>> for i in c:
i.extract()
>>> body = bsoup.body(text=True)
>>> text = ''.join(body)
>>> text
u' \n'