Mostly finished reddit crawler is failing to crawl more than 9 or so pages
<pre><code>import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class Scraper {

    private String filePath = "c://reddit//";
    private String url;
    private int count;
    private String after;
    private static String subreddit;

    public Scraper(String sr) {
        url = String.format("http://www.reddit.com/r/%s.xml?limit=100", sr);
    }

    public static void main(String[] args) {
        Scanner input = new Scanner(System.in);
        System.out.println("enter subreddit with pics only");
        subreddit = input.next();
        System.out.println("enter amount of pages to crawl");
        int pages = input.nextInt();
        Scraper scraper = new Scraper(subreddit);
        input.close();

        int i = 0;
        while (i < pages) {
            scraper.getNextPage();
            scraper.getImgur();
            scraper.getImgurA();
            scraper.getImgurAddI();
            i++;
        }
    }

    public void download(String _url, String name) {
        /*
         * setup streams.. write image as bytes to filePath
         */
        InputStream is = null;
        OutputStream os = null;
        try {
            URL url = new URL(_url);
            is = url.openStream();
            os = new FileOutputStream(filePath + name + ".jpg");
            for (int b; (b = is.read()) != -1;) {
                os.write(b);
            }
        } catch (MalformedURLException mue) {
            System.out.println("invalid url");
        } catch (IOException e) {
            System.out.println("no stream");
        } finally {
            if (is != null) {
                try {
                    is.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (os != null) {
                try {
                    os.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    public void getImgur() {
        /*
         * grab all imgur's in the context of http://i.imgur.com/. The second
         * parameter to download() is the filename
         */
        try {
            System.out.println("connecting to imgur");
            Elements description = getSubreddit();
            for (Element imgur : description) {
                Pattern pattern = Pattern.compile("http://i\\.imgur\\.com/\\w+");
                Matcher matcher = pattern.matcher(imgur.text());
                if (matcher.find()) {
                    System.out.println("downloading image: " + " " + matcher.group());
                    download((matcher.group() + ".jpg"), matcher.group().substring(18));
                }
            }
        } catch (Exception e) {
            System.out.println("getImgur() failed");
        } finally {
            System.out.println("grabbed all imgurs");
        }
    }

    public void getImgurAddI() {
        /*
         * grab all imgur's in the context of http://imgur.com/, if it is an
         * album then skip otherwise add "i" to beginning of imgur in order to
         * get image
         */
        try {
            System.out.println("finding imgurs without prefix i and adding i");
            Elements description = getSubreddit();
            for (Element imgur : description) {
                Pattern pattern = Pattern.compile("http://imgur\\.com/\\w+");
                Matcher matcher = pattern.matcher(imgur.text());
                if (matcher.find()) {
                    if (!matcher.group().endsWith("a")) {
                        // make imgur downloadable by adding 'i' before imgur
                        String newUrl = matcher.group();
                        newUrl = "http://i." + newUrl.substring(7);
                        download(newUrl + ".jpg", newUrl.substring(18));
                    }
                }
            }
        } catch (Exception e) {
            System.out.println("getImgurAddI() failed");
        } finally {
            System.out.println("grabbed all imgurs by adding I");
        }
    }

    private void getImgurA() {
        /*
         * grab all albums then call extract() to get each individual image
         */
        try {
            System.out.println("connecting to imgur album");
            Elements description = getSubreddit();
            for (Element imgur : description) {
                Pattern pattern = Pattern.compile("http://imgur.com/a/\\w+");
                Matcher matcher = pattern.matcher(imgur.text());
                if (matcher.find()) {
                    System.out.println("Downloading image album...." + " " + matcher.group());
                    extract(matcher.group());
                }
            }
        } catch (Exception e) {
            System.out.println("getImgurA() failed");
        } finally {
            System.out.println("extracted all imgur albums");
        }
    }

    private void extract(String album) {
        /*
         * open connection to imgur album and download each individual image,
         * validate imgur..if it ends with "s" most likely a thumbnail duplicate
         * so skip it
         */
        try {
            Document doc = Jsoup.connect(album).get();
            Elements pics = doc.getElementsByTag("img");
            String image = null;
            for (Element pic : pics) {
                /*
                 * get all image's inside the data-src attribute, make sure url
                 * is valid first
                 */
                image = pic.attr("data-src");
                if (image != "" && (!image.substring(0, image.length() - 4).endsWith("s"))) {
                    if (image.endsWith(".jpg?1") || image.endsWith(".jpg?2")) {
                        if (image.substring(2, image.length() - 6).endsWith("s")) {
                            System.out.println("skipping download of thumbnail/duplicate");
                        } else {
                            System.out.println("extracting jpg1/jpg2..... " + image.substring(2));
                            download("http://" + image.substring(2, image.length() - 2),
                                    image.substring(14, image.length() - 6));
                        }
                    } else {
                        System.out.println("extracting..... " + image.substring(2));
                        download("http://" + image.substring(2), image.substring(14));
                    }
                }
            }
        } catch (IOException e) {
            System.out.println("extract() failed");
        }
    }

    public Elements getSubreddit() {
        /*
         * return an Elements with the information to be scraped
         * to caller method, setup user agent
         */
        Document doc;
        Elements description = null;
        try {
            doc = Jsoup.connect(url)
                    .userAgent("Mozilla/5.0 (Windows; U; WindowsNT 5.1; en-US; rv1.8.1.6) Gecko/20070725 Firefox/2.0.0.6")
                    .referrer("http://www.google.com")
                    .get();
            description = doc.getElementsByTag("description");
        } catch (IOException e) {
            System.out.println("getSubreddit() failed");
        }
        return description;
    }

    public void getNextPage() {
        /*
         * crawls current url to get next url
         */
        System.out.println("Crawling next page..............");
        Document doc;
        try {
            url = url.replace(".xml", "");
            doc = Jsoup.connect(url).get();
            Elements next = doc.getElementsByTag("span");
            for (Element n : next) {
                if (n.className().equals("nextprev")) {
                    Pattern pattern = Pattern.compile("after=\\w+");
                    Matcher matcher = pattern.matcher(n.toString());
                    if (matcher.find()) {
                        after = matcher.group().substring(6);
                        count += 100;
                        url = String.format(
                                "http://www.reddit.com/r/%s.xml?limit=100&count=%d&after=%s",
                                subreddit, count, after);
                        System.out.println("Crawling page.........: " + url);
                    }
                }
            }
        } catch (IOException e) {
            System.out.println("getNextPage() failed");
        }
    }
}
</code></pre>

<p>Sorry if this is hard to follow; I haven't broken the class up yet because I am still working out the kinks. It crawls about 9 pages without a single error, then every page after that just fails with "socket timed out" or "connection timed out". Here is example output from trying to crawl 25 pages: <code>http://pastebin.com/sP9UwGk9</code>.</p>
<p>The bigger issue is that this originally was a multithreaded crawler, but it failed about 50 times more often that way, so I made it slower. I added a bunch of Thread.sleep calls everywhere the program connects to a website or starts a download (a rough sketch of what I mean is below), but I still keep getting errors. Is there something I am doing wrong? I know reddit has some kind of rate limiter, but I am not sure what is wrong, because this program is pretty slow in the first place (unless I use the threads).</p>

<p>edit: console log <a href="http://pastebin.com/fhrjSeKx" rel="nofollow">http://pastebin.com/fhrjSeKx</a></p>
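<p>For reference, the delays I mention look roughly like this. The actual sleep calls are scattered through the methods above and are not shown in the posted code, so the <code>pause()</code> helper name, its placement, and the 2-second value here are just illustrative stand-ins:</p>

<pre><code>// Illustrative sketch only: the real code calls Thread.sleep directly after
// each Jsoup.connect(...) and before each download; durations vary.
private void pause(long millis) {
    try {
        Thread.sleep(millis); // back off so reddit/imgur are not hit in rapid succession
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt(); // restore the interrupt flag and carry on
    }
}

// e.g. inside getSubreddit(), after the page has been fetched:
// doc = Jsoup.connect(url).userAgent(...).referrer(...).get();
// pause(2000); // wait roughly 2 seconds before the next request
</code></pre>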