Note that there are some explanatory texts on larger screens.

plurals
  1. PO
    text
    copied!<p>So maybe you don't want to walk the divs, but here is my solution using lxml, which I highly recommend:</p> <pre><code>import re from cStringIO import StringIO from lxml import etree def getTable(html, table_xpath, rows_xpath, cells_xpath): """Get a table on a webpage""" parser = etree.HTMLParser() # Build document tree and get table root = etree.parse(StringIO(html), parser) table = root.find(table_xpath) if table == None: print 'No table.' return [] rows = table.findall(rows_xpath) document = [] def cleanText(text): """Clean up text by replacing line breaks and tabs. """ return re.sub(r'[\r\n\t]+','',str(text).strip()) # iterate over the table rows and collect text from each cell. for r in rows: cells = r.findall(cells_xpath) rowdata = [] for c in cells: text = '' it = c.itertext() for i in it: text += cleanText(i) + ' ' rowdata.append(text) document.append(rowdata) return document html = """ &lt;html&gt;&lt;head&gt;&lt;title&gt;&lt;/title&gt;&lt;/head&gt;&lt;body&gt; &lt;p align="center"&gt; &lt;img src="some_image.gif" alt="Some Title"&gt; &lt;/p&gt; &lt;TABLE WIDTH=500 BORDER=1 class=textwhite ALIGN=center CELLPADDING=0 CELLSPACING=0&gt; &lt;TR&gt; &lt;TD colspan=4 ALIGN=center&gt;&lt;b&gt;Title&lt;/b&gt;&lt;/TD&gt; &lt;/TR&gt; &lt;TR&gt; &lt;TD ALIGN=center&gt;Title&lt;/TD&gt; &lt;TD ALIGN=center&gt;date&lt;/TD&gt; &lt;TD ALIGN=center&gt;value&lt;/TD&gt; &lt;TD ALIGN=center&gt;value&lt;/TD&gt; &lt;/TR&gt;&lt;TR&gt; &lt;TD ALIGN=center&gt;Title2&lt;/TD&gt; &lt;TD ALIGN=center&gt;&lt;/TD&gt; &lt;TD ALIGN=center&gt;&lt;div class=redtext&gt;----&lt;/div&gt;&lt;/TD&gt; &lt;TD&gt;&amp;nbsp;&lt;/TD&gt; &lt;/TR&gt;&lt;TR&gt; &lt;TD ALIGN=center&gt;Title3&lt;/TD&gt; &lt;TD ALIGN=center&gt;&lt;div class=yellowtext&gt;value&lt;/div&gt;&lt;/TD&gt; &lt;TD ALIGN=center&gt;&lt;div class=redtext&gt;value&lt;/div&gt;&lt;/TD&gt; &lt;TD ALIGN=center&gt;value&lt;SUP&gt;6&lt;/SUP&gt;&lt;/TD&gt; &lt;/TR&gt;&lt;TR&gt; &lt;TD ALIGN=center&gt;Title4&lt;/TD&gt; &lt;TD 
ALIGN=center&gt;&lt;div class=bluetext&gt;value&lt;/div&gt;&lt;/TD&gt; &lt;TD ALIGN=center&gt;&lt;div class=redtext&gt;value&lt;/div&gt;&lt;/TD&gt; &lt;TD&gt;&amp;nbsp;&lt;/TD&gt; &lt;/TR&gt;&lt;/TABLE&gt; &lt;/body&gt; &lt;/html&gt; """ tp = "//table[@width='500']" rt = "tr" cp = "td[@align='center']" doc = getTable(html, tp, rt, cp) print repr(doc) </code></pre>
 

Querying!

 
Guidance

SQuiL has stopped working due to an internal error.

If you are curious, you may find further information in the browser console, which is accessible through the devtools (F12).

Reload