Hi there, I have a project that was using quite an old version of mochiweb. I tried updating to the most recent version, but found a problem in mochiweb_html.
I've narrowed it down to a specific commit - 203499a9e108bed8cf2cf6d8b8606128ed806361.
The problem can be seen by running the following code against the attached sample Google search results page:
{ok,DocBin} = file:read_file(filename:join("/tmp","dentists-dublin-google.html")).
mochiweb_html:parse(DocBin).
{<<"head">>,[],
[{<<"title">>,[],[<<"dentists dublin - Google Search">>]},
{<<"script">>,[],
[<<"
window.google={kEI:\"sHsdTq30NILDhAfTyOTCBw\",kEXPI:\"17311,27731,28505,28936,30316"...>>]},
{<<"script">>,[],
[<<"
window.google.sn=\"web\";var i=window.google.timers={};window.google.startTick"...>>]},
{<<"style">>,
[{<<"id">>,<<"gstyle">>}],
[<<"body{color:#000;margin:3px 0;overflow-y:scroll}body,#leftnav,#tbdi,#hidd"...>>]},
{<<"script">>,[],
[<<"var _gjwl=location;function _gjuc(){var b=_gjwl.href.indexOf(\"#\");if"...>>]},
{<<"script">>,[],
[<<"google.y={};google.x=function(e,g){google.y[
e.id]=[e,g];return f"...>>]},
{<<"script">>,[],
{<<"script">>,[],
[<<"(function(){try{var e=true,j=false;var m=window.gbar=win"...>>]},
{<<"script">>,[],
[<<"(function(){var is_plus_one_user=true;window.gbar&&g"...>>]}]}
Whereas, if you revert to the previous commit, d163f12caa60f6bf61a8b3392091dc3c2781c53e, you get these results:
mochiweb_html:parse(DocBin).
{<<"!doctype">>,
[{<<"html">>,<<"html">>}],
[{<<"head">>,[],
[{<<"title">>,[],[<<"dentists dublin - Google Search">>]},
{<<"script">>,[],
[<<"
window.google={kEI:\"sHsdTq30NILDhAfTyOTCBw\",kEXPI:\"17311,27731,2"...>>]},
{<<"script">>,[],
{<<"style">>,
[{<<"id">>,<<"gstyle">>}],
[<<"body{color:#000;margin:3px 0;overflow-y:scroll}body,#lef"...>>]},
{<<"script">>,[],
[<<"var _gjwl=location;function _gjuc(){var b=_gjwl.href"...>>]},
{<<"script">>,[],
[<<"google.y={};google.x=function(e,g){google.y[
e.id"...>>]},
{<<"script">>,[],
{<<"script">>,[],
[<<"(function(){try{var e=true,j=false;var m"...>>]},
{<<"script">>,[],
[<<"(function(){var is_plus_one_user=tru"...>>]}]},
{<<"body">>,
[{<<"dir">>,<<"ltr">>},
{<<"lang">>,<<"en-IE">>},
{<<"id">>,<<"gsr">>},
{<<"topmargin">>,<<"3">>},
{<<"marginheight">>,<<"3">>}],
[{<<"noscript">>,[],
[{<<"style">>,[],
[<<" .jsb{ display:none } .nojsb{ display:block } .n"...>>]}]},
{<<"div">>,
[{<<"id">>,<<"mngb">>}],
[{<<"div">>,
[{<<"id">>,<<"gb">>}],
[{<<"div">>,
[{<<"id">>,<<"gbw">>}],
[{<<"div">>,
[{<<"id">>,<<"gbz">>}],
[{<<"span">>,[{...}],[]},{<<"ol">>,[...],...}]},
{<<"div">>,
[{<<"id">>,<<"gbg">>}],
[{<<"h2">>,[...],...},{<<...>>,...},{...}]}]},
{<<"div">>,[{<<"id">>,<<"gbx3">>}],[]},
{<<"div">>,[{<<"id">>,<<"gbx4">>}],[]}]}]},
{<<"script">>,[],
[<<"if(google.j.b)document.body.style.display='none';">>]},
{<<"iframe">>,
[{<<"onload">>,<<"google.j.l()">>},
{<<"onerror">>,<<"google.j.e()">>},
{<<"style">>,<<"display:none">>},
{<<"src">>,<<"/blank.html">>},
{<<"name">>,<<"wgjf">>}],
[]},
{<<"textarea">>,
[{<<"name">>,<<"csi">>},
{<<"style">>,<<"display:none">>},
{<<"id">>,<<"csi">>}],
[<<>>]},
{<<"textarea">>,
[{<<"name">>,<<"wwcache">>},
{<<"style">>,<<"display:none">>},
{<<"id">>,<<"wwcache">>}],
[<<>>]},
{<<"textarea">>,
[{<<"name">>,<<"wgjc">>},
{<<"style">>,<<"display:none">>},
{<<"id">>,<<"wgjc">>}],
[<<>>]},
{<<"textarea">>,
[{<<"name">>,<<"hcache">>},
{<<"style">>,<<"display:none">>},
{<<"id">>,<<"hcache">>}],
[<<>>]},
{<<"a">>,
[{<<"style">>,<<"left:-1000em;position:absolu"...>>},
{<<"href">>,<<"/setprefs?prev=
http://ww"...>>}],
[<<"Screen-reader users, click here "...>>]},
{<<"div">>,
[{<<"id">>,<<"main">>}],
[{<<"div">>,[],[{<<"div">>,[{...}],[...]}]}]}]}]}
Basically, the entire body and its content are getting lost.