For anyone interested: what I did was make a superclass, `Graph`, which holds the instance variable `__crawled`, and I moved my crawling functions into it. `Page` now contains only attributes describing the page and its related pages. I pickle my instance of `Graph`, which contains all of my instances of `Page`. Here is my code (Python 2; I also fixed the mutable default arguments, opened the pickle files in binary mode, and made the saved/loaded file names match):

```python
from urllib import urlopen  # Python 2; on Python 3 this is urllib.request.urlopen
import re
import pickle


################### CLASS GRAPH ####################
class Graph(object):
    def __init__(self, roots=None, crawled=None):
        # Avoid mutable default arguments: a shared []/{} default
        # would be reused across every Graph instance.
        self.__roots = roots if roots is not None else []
        self.__crawled = crawled if crawled is not None else {}

    @property
    def roots(self):
        return self.__roots

    @property
    def crawled(self):
        return self.__crawled

    def crawl(self, page, url):
        """Fetch url (if unseen), build a Page for it, and link it to page."""
        if url not in self.__crawled:
            webpage = urlopen(url).read()
            patFinderTitle = re.compile('<title>(.*)</title>')
            patFinderLink = re.compile('<link rel="canonical" href="([^"]*)" />')
            patFinderRelated = re.compile('<li><a href="([^"]*)"')
            findPatTitle = re.findall(patFinderTitle, webpage)
            findPatLink = re.findall(patFinderLink, webpage)
            findPatRelated = re.findall(patFinderRelated, webpage)
            newPage = Page(findPatTitle, findPatLink, findPatRelated)
            page.related.append(newPage)
            self.__crawled[url] = newPage
        else:
            # Already crawled: reuse the existing Page instead of refetching.
            page.related.append(self.__crawled[url])

    def crawlRelated(self, page):
        for link in page.relatedURLs:
            self.crawl(page, link)

    def crawlAll(self, obj, limit=2, i=0):
        """Recursive crawl, at most `limit` levels deep."""
        print 'number of crawled pages:', len(self.crawled)
        i += 1
        if i > limit:
            return
        for rel in obj.related:
            print 'crawling', rel.title
            self.crawlRelated(rel)
        for rel2 in obj.related:
            self.crawlAll(rel2, limit, i)

    def loadGraph(self, filename):
        # Pickle files must be opened in binary mode.
        with open(filename, 'rb') as inf:
            return pickle.load(inf)

    def saveGraph(self, obj, filename):
        with open(filename, 'wb') as outf:
            pickle.dump(obj, outf)


################### CLASS PAGE #####################
class Page(Graph):
    def __init__(self, title='', link='', relatedURLs=None):
        self.__title = title
        self.__link = link
        self.__relatedURLs = relatedURLs if relatedURLs is not None else []
        self.__related = []

    @property
    def relatedURLs(self):
        return self.__relatedURLs

    @property
    def title(self):
        return self.__title

    @property
    def related(self):
        return self.__related


####################### MAIN ######################
def main(seed):
    print 'doing some work...'
    webpage = urlopen(seed).read()
    patFinderTitle = re.compile('<title>(.*)</title>')
    patFinderLink = re.compile('<link rel="canonical" href="([^"]*)" />')
    patFinderRelated = re.compile('<li><a href="([^"]*)"')
    findPatTitle = re.findall(patFinderTitle, webpage)
    findPatLink = re.findall(patFinderLink, webpage)
    findPatRelated = re.findall(patFinderRelated, webpage)
    print 'found the webpage', findPatTitle
    G = Graph([Page(findPatTitle, findPatLink, findPatRelated)])
    print 'crawling related...'
    G.crawlRelated(G.roots[0])
    G.crawlAll(G.roots[0])
    print 'now saving...'
    G.saveGraph(G, 'medTwiceGraph.dat')
    print 'done'
    return G
##################### END MAIN ######################

#G = main('http://medtwice.com/menopause-overview/')
#print G.crawled


def loadGraph(filename):
    with open(filename, 'rb') as inf:
        return pickle.load(inf)

G = loadGraph('medTwiceGraph.dat')  # same file name saveGraph wrote above
print G.roots[0].title
print G.roots[0].related
print G.crawled

for key in G.crawled:
    print G.crawled[key].title
```
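Since the whole point of this design is that one pickled `Graph` carries every `Page`, including pages that appear in several `related` lists, it may help to see that pickle preserves shared references and cycles across a save/load round trip. Below is a minimal, self-contained sketch; the `Node` class and the `demo.dat` file name are made up for illustration and are not part of the code above.

```python
# Sketch: pickle memoizes objects, so a node reachable from both the
# "crawled" index and a "related" list is stored once, and the shared
# identity (and any cycles) survive a dump/load round trip.
import pickle

class Node(object):
    def __init__(self, name):
        self.name = name
        self.related = []

a = Node('a')
b = Node('b')
a.related.append(b)
b.related.append(a)  # a cycle, as in a graph of mutually linked pages

graph = {'roots': [a], 'crawled': {'http://example.com/b': b}}

with open('demo.dat', 'wb') as outf:  # binary mode for pickle
    pickle.dump(graph, outf)
with open('demo.dat', 'rb') as inf:
    loaded = pickle.load(inf)

root = loaded['roots'][0]
# Same object, not a copy: shared references are preserved.
print root.related[0] is loaded['crawled']['http://example.com/b']  # True
print root.related[0].related[0] is root                            # True
```

One caveat: pickle serializes nested objects recursively, so a very deep page graph can hit Python's recursion limit when dumping; raising it with `sys.setrecursionlimit` is a common workaround.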