StackOverflow2013

Note that there are some explanatory texts on larger screens.

plurals

PO
text
Body
# my example:
#
# NOTE(review): socket, urllib2 and BeautifulSoup are used below but are not
# imported anywhere in this snippet; presumably the surrounding module imports
# them (Python 2 with BeautifulSoup 3) -- confirm before running.

# Request headers sent with every page fetch.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "ru,en-us;q=0.7,en;q=0.3",
    "Accept-Charset": "windows-1251,utf-8;q=0.7,*;q=0.7",
    "Accept-Encoding": "identity, *;q=0",
    "Connection": "Keep-Alive",
}
# Optional proxy address; "http://" is prepended and "/" appended when it is
# used. None means no proxy handler is installed.
PROXY = None
# Value handed to socket.setdefaulttimeout() before each fetch.
timeout = 60


def _cell_text(cell):
    """Return every text node of *cell* joined together and stripped."""
    # A cell without text nodes simply yields an empty string.
    return "".join(cell.findAll(text=True)).strip(" \t\n")


def _decode_entities(text):
    """Drop ``&nbsp;`` and decode ``&quot;`` / ``&amp;`` left in *text*."""
    # ``&amp;`` is decoded last so an escaped entity such as ``&amp;quot;``
    # is unescaped exactly once instead of twice.
    return (
        text.replace("&nbsp;", "")
        .replace("&quot;", '"')
        .replace("&amp;", "&")
    )


def parse_manuf_page_about(page_str_about):
    """Fetch and parse one manufacturer "about us" page.

    Args:
        page_str_about: URL of the page to download.

    Returns:
        A ``(res, slovar)`` tuple. ``res`` is False when the download failed
        (the error is printed and ``slovar`` is empty) and True otherwise.
        ``slovar`` maps the text of each table header cell to the text of the
        data cell(s) in the same row, plus the keys ``"Company_Info"``,
        ``"Contact_Person"`` (only when a ``member-name`` link exists) and
        ``"Gold member"`` (``"Y"`` or ``"N"``).

    Raises:
        IndexError: if the page lacks the expected ``div``/``p`` elements.
    """
    slovar = {}
    socket.setdefaulttimeout(timeout)

    if PROXY is not None:
        proxy_handler = urllib2.ProxyHandler({"http": "http://" + PROXY + "/"})
        # install_opener() changes the opener used by every later urlopen().
        urllib2.install_opener(urllib2.build_opener(proxy_handler))

    page_request = urllib2.Request(url=page_str_about, headers=HEADERS)
    try:
        response = urllib2.urlopen(url=page_request)
        try:
            page = response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()
    except Exception as error:
        # Best-effort fetch: report the problem and signal failure via res.
        print(str(error))
        return False, slovar

    soup = BeautifulSoup(page)

    # Free-text company description: first paragraph of the "about us" box.
    about_box = soup.findAll('div', {"class": "win aboutUs"})[0]
    about_paragraph = about_box.findAll("p")[0]
    company_info = " ".join(about_paragraph.findAll(text=True)).strip(" \t\n")

    # Header/value rows of the first "win" box.
    info_table = soup.findAll('div', {"class": "win"})[0]
    for row in info_table.findAll("tr"):
        for header_cell in row.findAll("th"):
            header_text = _cell_text(header_cell)
        for value_cell in row.findAll("td"):
            # NOTE(review): a row without a header cell reuses the header of
            # the previous row (or fails if none was seen yet), exactly as the
            # original code did -- confirm this is intended.
            slovar[str(header_text)] = _decode_entities(_cell_text(value_cell))
    slovar["Company_Info"] = company_info

    # Parse the lower contact table and pull out the contact's name.
    for contact_person in soup.findAll('a', {"class": "member-name"}):
        slovar["Contact_Person"] = contact_person.contents[0]

    # Gold partner status is inferred from the presence of the member logo
    # badge (shown in the upper-left corner of the page).
    gold_badges = soup.findAll('a', {"class": "memberLogo"})
    slovar["Gold member"] = "Y" if gold_badges else "N"

    return True, slovar


# This code parses a single manufacturer page on Alibaba.com. You can see an
# example page here: http://xmxinhuafeng.en.alibaba.com/aboutus.html

Querying!

Guidance

An individual column

Larger individual text columns get their own page to allow for proper reading.

SQuiL has stopped working due to an internal error.

If you are curious you may find further information in the browser console, which is accessible through the devtools (F12).

Reload