Table extraction from HTML

747 views
Skip to first unread message

Joel Frost

unread,
Nov 8, 2023, 2:36:23 PM11/8/23
to Google Apps Script Community
I have an app script that will go to a website and download the HTML from the site. It places the HTML is column V. I need to write an app script that will read the HTML in column V and extract a table with id 'grdOwners' and I have been using ChatGPT to write these scripts. Up till this one ChatGPT has been killing it. I have been able to work through the App Script Editor debug errors, etc, and I am getting scripts that do exactly what I want them to do. But this one is a bugger. 

ChatGPT has spent over 30 different attempts in writing, analyzing errors, editing, I have it making version numbers of each attempt so I am able to track which gets us the closest. It has used XMLParser, HTML stuff, it tried a formula called IMPORTXML. Nothing works. I am hoping that there is an expert here that can crack this thing and I can move on with the rest of the project. 

I am going to paste a sample of the HTML code from column V, as well as the latest attempt from ChatGPT. 

Any help would be great

SAMPLE HTML:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml"> <head><title> Salt Lake County Recorder - LOGIN </title><meta name="author" content="Salt Lake County Recorder" /><meta name="googlebot" content="noodp" /><meta name="slurp" content="noydir" /><meta name="msnbot" content="noodp" /><meta name="robots" content="noodp" /><meta name="robots" content="all" /><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><meta name="description" content="Salt Lake County Recorder provides access to public information about records and recording in the County of Salt Lake." lang="en" /><meta name="keywords" content="Salt Lake County Recorder, Data Services, Documents, Maps, Parcels, Plats, Property, Abstract, Omni, GIS, Recorder Tax, Data Searches, Records, Recording, Rashelle Hobbs." /><meta name="Language" content="english" /><meta name="copyright" content="Copyright 2012-2019 Salt Lake County Recorder" /><meta name="application-name" content="recorder.slco.org" /><meta name="msapplication-starturl" content="https://recorder.slco.org" /><link rel="Stylesheet" href="css/DataServicesSearches.css" type="text/css" media="all" /><link rel="shortcut icon" href="slcr_favicon/favicon.png" /> <script src="jqscripts/OpenWindows.js" type="text/javascript"></script> <style type="text/css"> .auto-style1 { height: 58px; } </style> <!-- Begin Google Analytics --> <script async src="https://www.googletagmanager.com/gtag/js?id=G-1KNH50XV3B"></script> <script> window.dataLayer = window.dataLayer || []; function gtag(){dataLayer.push(arguments);} gtag('js', new Date()); gtag('config', 'G-1KNH50XV3B'); (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); ga('create', 'UA-35053227-1', 'slco.org'); ga('send', 'pageview'); </script> <!--End Google Analytics --> <!--Begin Monsido --> <script type="text/javascript" async="" src="https://app-script.monsido.com/v2/monsido-script.js"></script> <script type="text/javascript"> window._monsido = window._monsido || { token: "SOycXs72AOCzqLeoC5s3JQ", statistics: { enabled: true, cookieLessTracking: true, documentTracking: { enabled: true, documentCls: "monsido_download", documentIgnoreCls: "monsido_ignore_download", documentExt: ["pdf","doc"], }, }, }; </script> <!--End Monsido --> <!--Begin MSClarity--> <script type="text/javascript"> (function(c,l,a,r,i,t,y){ c[a]=c[a]||function(){(c[a].q=c[a].q||[]).push(arguments)}; t=l.createElement(r);t.async=1;t.src="https://www.clarity.ms/tag/"+i; y=l.getElementsByTagName(r)[0];y.parentNode.insertBefore(t,y); })(window, document, "clarity", "script", "ijxq04kjvg"); </script> <!--End MSClarity--> <!-- Hotjar Tracking Code for https://slco.org --> <script> (function(h,o,t,j,a,r){ h.hj=h.hj||function(){(h.hj.q=h.hj.q||[]).push(arguments)}; h._hjSettings={hjid:927949,hjsv:6}; a=o.getElementsByTagName('head')[0]; r=o.createElement('script');r.async=1; r.src=t+h._hjSettings.hjid+j+h._hjSettings.hjsv; a.appendChild(r); })(window,document,'https://static.hotjar.com/c/hotjar-','.js?sv='); </script> <!-- End Hotjar Tracking Code for https://slco.org --></head> <body style="background-color: #424242"> <form method="post" action="./SLCRLogin.aspx?ReturnUrl=%2fdata-services%2fSearch%2fParcelDataDocuments.aspx" onkeypress="javascript:return WebForm_FireDefaultButton(event, 'btnSignIn')" id="form1"> <div class="aspNetHidden"> <input type="hidden" name="__LASTFOCUS" id="__LASTFOCUS" value="" /> <input type="hidden" name="__EVENTTARGET" id="__EVENTTARGET" value="" /> <input type="hidden" name="__EVENTARGUMENT" id="__EVENTARGUMENT" value="" /> <input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="/wEPDwUKMTk3NTczMzU0MA9kFgICAw9kFgICAw8PFgIeBFRleHQFJlN1bmRheSBOb3ZlbWJlciAwNSwgMjAyMyAtIDEwOjAzOjIyIFBNZGQYAQUeX19Db250cm9sc1JlcXVpcmVQb3N0QmFja0tleV9fFgEFC0ltZ1NMQ1JMb2dvyVCBlFXfpAJQfBgf4I3f3AT4AIHiOYEUw02i95q0SMA=" /> </div> <script type="text/javascript"> //<![CDATA[ var theForm = document.forms['form1']; if (!theForm) { theForm = document.form1; } function __doPostBack(eventTarget, eventArgument) { if (!theForm.onsubmit || (theForm.onsubmit() != false)) { theForm.__EVENTTARGET.value = eventTarget; theForm.__EVENTARGUMENT.value = eventArgument; theForm.submit(); } } //]]> </script> <script src="/data-services/WebResource.axd?d=DjdZFq2WGKwdrlKtj7uoOe0cv7apc3FiComuLjv8PQ9R_5bsXGJrdVAzt-sVKzic3BbjL6kLsfGfLfCr5SjZBDiHvqkc62e5xusZQenLYFI1&amp;t=638285863360000000" type="text/javascript"></script> <script src="/data-services/ScriptResource.axd?d=-JwSf2AnceQZYLrtI-449vehdcvRSkjFTUyH208Vae5RmG1S24pYaPGASIu9JhwvM9wmzj9HIuI5STn5P2VFnGmOLTzhNSLR3KtkxmySwWvgf33z5UuejOF-bsApalxW0lU109K9lbzqRx1XlYvRPbgMUfEsLuRLUn042r0QDCk1&amp;t=13798092" type="text/javascript"></script> <script src="/data-services/ScriptResource.axd?d=Yd3QQ-McKUjLRHtnf6SJurVgqNDHteAUyXZXPd3CoxeqAnt3ttrw0dIyJLFRKnMg-3YdBtvOVFRKk9LqNK0vYAD3kn3MfrpsVDFQdkg64L9_Qd7C_-gJX0GBrmktZCMZFokJlMyIKOnA5wr4wZuBb7-sNANuxaeoygMdmnFWfR3SFWZW1D_3QlDNfbFFkcqh0&amp;t=13798092" type="text/javascript"></script> <script src="/data-services/WebResource.axd?d=Gk2csAiGotrTIiXjVWBZ-YamvfxoPpUMww01NPKmBw2xQ5Jh_JMfgn0_8bQXrjtk-Og8327GUXCbW5cTdwy3rfYydubBbD6INTHHeXmTb_41&amp;t=638285863360000000" type="text/javascript"></script> <div class="aspNetHidden"> <input type="hidden" name="__VIEWSTATEGENERATOR" id="__VIEWSTATEGENERATOR" value="8ACBB5E2" /> <input type="hidden" name="__EVENTVALIDATION" id="__EVENTVALIDATION" value="/wEdAAcbQlxGoLSr3qJgq0Xx/wpPrT8C6VxSoCa8b9hXk2pDPutNuoVwSdz/ktY4aU3x1XvZLo06Fl6DYP/kYTXPzdB2r5/bwY1XfX7PrkNsOrExLvEKDGUbL6X1U4S+8ZrYROb2r5RJnV8b+a6vxqLKu75Li6c3/dvUeGTEm/YNzGqkmL09405llAEatGVBK8kdzds=" /> </div> <script type="text/javascript"> //<![CDATA[ Sys.WebForms.PageRequestManager._initialize('ScriptManager1', 'form1', [], [], [], 90, ''); //]]> </script> <div style="padding: 22px; width: 5px"></div> <div id="page"> <table style="margin-top: 11px; margin-left: 5px; color: #FFFFFF; font-family: Tahoma; font-size: 12px"> <tr> <td class="style45"> <span style="font-family: Tahoma; font-size: 12px; margin-left: 3px">Salt Lake City,</span> <span id="LblDateTime" style="color: #FFFFFF; margin-left: 0px">Sunday November 05, 2023 - 10:03:22 PM</span> </td> <td class="style46"> <span style="color: #f0efd9; font-family: Tahoma; font-size: 12px; margin-left: 777px">Cross Browser Support: <a href="javascript: openwindowBrowserSettings()" title="Browser Settings"> <img id="ImgBrowser" title="Browser Compatibility Support: IE, Safari, Opera, Firefox and Chrome." src="images/browser_logos_.png" style="border-width: 0px; outline: none; outline-style: none; outline-width: 0px;" /> </a></span> </td> </tr> </table> <div id="wrap"> <div id="header" style="margin-top: -0px"> <div> <table> <tr> <td> <table> <tr> <td> <input type="image" name="ImgSLCRLogo" id="ImgSLCRLogo" title="Return to Recorder Home" src="images/NewCountyRecorderLogo.jpg" style="outline: none; outline-style: none; margin-left: 10px" /> </td> </tr> <tr> <td> <div style="padding: 5px"></div> <a href="/recorder" title="Return to Recorder Home" style="margin-left: 13px; color: black; font-size: 14pt; text-decoration: underline; font-family: 'segoe regular'">Recorder Home</a> <a href="/data-services" title="Return to Data Services Home" style="margin-left: 13px; color: black; font-size: 14pt; text-decoration: underline; font-family: 'segoe regular'">Data Services Home</a> </td> </tr> </table> </td> </tr> </table> </div> </div> <div style="padding: 25px"></div> <table style="color: #000000; font-family: Tahoma; background: #2d4262; width: 1405px; height: 33px; margin-left: 10px; text-transform: uppercase; font-size: 11pt; color: white; letter-spacing: .5em;"> <tr> <td> <span style="margin-left: 557px"></span> </td> </tr> </table> <div style="padding: 5px"></div> <div id="headerNewLogin" style="margin-left: 164px"> <h1 id="logo" style="height: 36px;">Salt Lake County<span class="Red" style="margin-top: -45px">&nbsp;Recorder</span></h1> </div> <div style="padding: 5px"></div> <table style="width: 93%; height: 475px; margin-left: 5px"> <tr> <td style="width: 37%"> <div style="width: 332px; top: 15px; left: 45px; margin-left: 65px"> <p><span style="font-size: 12pt; font-family: Tahoma;"><b>$5 24-Hour Access Users:</b></span></p> <p><span style="font-size: 12pt; font-family: Tahoma;"><a href="https://slco.org/data-services/SearchDay/DataSearchesDay.aspx">Please Login Here</a> </span></p> </div> <br /><br /> <div style="width: 332px; top: 15px; left: 45px; margin-left: 65px"> <p><span style="font-size: 12pt; font-family: Tahoma;"><b>Account Users:</b></span></p> <p><span style="font-size: 11pt; font-family: Tahoma;">For Password Help Please call us at:</span></p> <p><span style="font-size: 11pt; font-family: Tahoma;"><b>385-468-8145</b></span></p> <p><span style="font-size: 11pt; font-family: Tahoma;">Or email us at: <a href="mailto:Reco...@slco.org?subject= Data Services Password Help"><b>Password Help</b></a></span></p> <p><span style="font-size: 11pt; font-family: Tahoma;">Thank you</span></p> </div> </td> <td> <div class="loginNew" style="margin-left: 44px"> <table style="width: 100%; height: 43px; margin-left: -5px;"> <tr> <td style="vertical-align: top" class="auto-style1"> <img id="Image5" src="images/SecureLogin.png" style="margin-left: 15px; margin-top: -5px;" /> <div id="hRuler" class="horizontalRule"></div> </td> </tr> </table> <div style="padding: 5px"></div> <table style="width: 100%;"> <tr> <td> <label for="username" style="font-family: Tahoma; font-size: 16px; font-weight: bold;">Username:</label> </td> </tr> </table> <input name="TxtUsername" type="text" maxlength="8" id="TxtUsername" tabindex="1" class="inputNewLoginUsername " style="font-family:arial;font-size:13pt;" /> <div style="padding: 5px"></div> <table style="width: 100%;"> <tr> <td class="auto-style1"> <label for="password" style="margin-left: 0px; font-family: Tahoma; font-size: 16px; font-weight: bold;">Password:</label> </td> </tr> </table> <input name="TxtPassword" type="password" maxlength="21" id="TxtPassword" tabindex="2" class="inputNewLoginPassword" style="font-family:arial;font-size:13pt;" /> <div id="lowerNewLogin"> <div style="padding: 5px"> <span id="labPasswordStatus"></span> </div> <table> <tr> <td> <input type="submit" name="btnSignIn" value="Log In" id="btnSignIn" tabindex="3" title="Click To Sign In To Your Account" class="newbtnNewLogin" style="width:148px;" /> </td> </tr> <tr> <td>&nbsp</td> </tr> <tr> <td> <input type="submit" name="BtnResetLogin" value="Clear" id="BtnResetLogin" title="Reset Username &amp; Password" class="newbtnNewLogin" style="width:148px;" /> </td> <td> <input type="submit" name="btnChangePassword" value="Change Password" id="btnChangePassword" title="Change Password" class="newbtnNewLogin" style="width:148px;margin-left: 5px" /> </td> </tr> </table> </div> </div> </td> <td style="width: 55px"></td> <td style="width: 555px"> <span style="font-family: Tahoma; text-align: justify; font-size: 11pt; color: #000000; margin-top: 5px"> <b>ATTENTION RECORDER’S OFFICE USERS: </b> <br /> <br /> As a result of the Utah State Legislature’s passage of HB247, there will be a change in the Recorder’s Offices Fee Schedule effective <b>May 14, 2019.</b> To see details of this change please <a href="javascript: openwindowRecorderLetter()" title="View Recorder's Letter" style="font-family: Tahoma; font-weight: bold; font-size: 11pt">click here.</a> </span> <br /> <br /> <span style="font-family: Tahoma; text-align: justify; font-size: 11pt; color: #000000; margin-top: 5px">To view the new Fee Schedule, please <a href="/recorder/requirements/"><b><span style="color:navy">click here.</span></b></a> </span> <br /> <br /> <img id="Image6" src="images/blk_arrow.gif" alt="Black Arrow" /> <span style="font-family: Tahoma; text-align: justify; font-size: 11pt; color: #000000; margin-top: 5px">&nbsp;&nbsp; All records (excluding plats) will be charged a $40 flat-fee instead of a per page fee with additional fees for descriptions in excess of 10.</span> <br /> <br /> <img id="Image7" src="images/blk_arrow.gif" alt="Black Arrow" /> <span style="font-family: Tahoma; text-align: justify; font-size: 11pt; color: #000000; margin-top: 5px">&nbsp;&nbsp; All plats will have a $50/sheet charge with an additional fee of $2 per lot or unit.</span> </td> </tr> </table> <table style="width: 100%"> <tr> <td style="text-align: center"> <span id="Label2" style="font-family: Tahoma; font-size: 12pt; color: black; margin-left: 30px; font-weight: bold;">Need a New Account? <a href="/recorder/data-services-overview-and-agreement/"><b><span style="color:navy">Data Services Overview and User Agreement</span></b></a> </span> </td> </tr> <tr> <td style="text-align: center"> <span id="Label1" style="font-family: Tahoma; font-size: 12pt; color: black; margin-left: 30px; font-weight: bold;">For Technical support:</span> <span style="font-family: Tahoma; font-size: 12pt; color: #000000; margin-left: 5px">Please call<b> (385) 468-8159</b></span> </td> </tr> </table> <table style="width: 100%"> <tr> <td style="text-align: Center"> <span style="font-family: Tahoma; font-size: 12pt; color: #000000">This login page is for <b>website access</b> only. To reset your mainframe password <b>after hours</b>, call the Information Services Department - Help Desk at (385) 468-0700.</span> </td> </tr> </table> </div> </div> <script type="text/javascript"> //<![CDATA[ WebForm_AutoFocus('TxtUsername');//]]> </script> </form> </body> </html>

LATEST VERSION FROM CHATGPT
function extractAndPlaceTableInColumnW() { var sheet = SpreadsheetApp.getActiveSpreadsheet().getActiveSheet(); var data = sheet.getDataRange().getValues(); var tableId = 'grdOwners'; // Loop through each row, starting from row 2 (assuming row 1 is the header) for (var i = 1; i < data.length; i++) { var htmlContent = data[i][21]; // Assuming column V is the 22nd column (0-based index) if (htmlContent) { try { var extractedTable = extractTableByIdFromHTML(htmlContent, tableId); if (extractedTable) { // Update the value in column W (index 22) sheet.getRange(i + 1, 22).setValue(extractedTable); } else { sheet.getRange(i + 1, 22).setValue("Table with ID 'grdOwners' not found"); } } catch (e) { sheet.getRange(i + 1, 22).setValue("Error extracting table"); } } } } // This function extracts the table with a specific ID from the HTML string. function extractTableByIdFromHTML(html, tableId) { var div = HtmlService.createHtmlOutput(html); var table = div.getContent().querySelector('#' + tableId); if (table) { return table.outerHTML; } return null; }  

Brett Grear

unread,
Nov 19, 2023, 4:33:58 AM11/19/23
to Google Apps Script Community
Can't you use =importhtml("urlhere","table",1) Change 1 for the table number on the page, it might take trial and error

Emerson Maia

unread,
Nov 19, 2023, 6:55:04 AM11/19/23
to google-apps-sc...@googlegroups.com
Você pode tentar o seguinte .:
function extractAndPlaceTableInColumnW() { const sheet = SpreadsheetApp.getActiveSpreadsheet().getActiveSheet(); const data = sheet.getDataRange().getValues(); data.slice(1).forEach((row, index) => { const htmlContent = row[21]; // Coluna V é a 22ª coluna (índice baseado em 0) if (htmlContent) { const extractedTable = extractTableByIdFromHTML(htmlContent, 'grdOwners'); const output = extractedTable ? extractedTable : "Table with ID 'grdOwners' not found"; sheet.getRange(index + 2, 23).setValue(output); // +2 porque estamos começando da segunda linha e o índice é baseado em 1 } }); } function extractTableByIdFromHTML(html, tableId) { try { const doc = XmlService.parse(html); const root = doc.getRootElement(); const tables = getElementsByTagAndAttribute(root, 'table', 'id', tableId); return tables.length > 0 ? XmlService.getRawFormat().format(tables[0]) : null; } catch (e) { Logger.log(e.toString()); return null; } } const getElementsByTagAndAttribute = (element, tagName, attributeName, attributeValue) => { let results = []; const descendants = element.getDescendants(); descendants.forEach(descendant => { const elem = descendant.asElement(); if (elem && elem.getName() === tagName) { const attribute = elem.getAttribute(attributeName); if (attribute && attribute.getValue() === attributeValue) { results.push(elem); } } }); return results; }


--
You received this message because you are subscribed to the Google Groups "Google Apps Script Community" group.
To unsubscribe from this group and stop receiving emails from it, send an email to google-apps-script-c...@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/google-apps-script-community/8502089a-aef3-40ba-8dd2-707e20c33a1en%40googlegroups.com.
Reply all
Reply to author
Forward
0 new messages