Note that there are some explanatory texts on larger screens.

plurals
  1. POMemory leak in Node.js scraper
    primarykey
    data
    text
    <p>This is a simple scraper written in JavaScript with Node.js, for scraping Wikipedia for periodic table element data. The dependencies are <a href="https://github.com/tmpvar/jsdom" rel="noreferrer">jsdom</a> for DOM manipulation and <a href="https://github.com/technoweenie/node-chain-gang" rel="noreferrer">chain-gang</a> for queuing. </p> <p>It works fine, most of the time (it doesn't handle errors gracefully), and the code isn't too bad, dare I say for a first attempt, but there is a serious fault with it - it leaks memory horribly, anywhere from 0.3% to 0.6% of the computer's memory for each element, such that by the time it gets to lead it would be using somewhere close to 20%, which is plainly unacceptable. </p> <p>I've tried working with profilers, but I have either not found them to be helpful or have difficulty interpreting the data. I suspect it has something to do with the way <code>processElement</code> gets passed around, but I have difficulty in rewriting the queue code into something more elegant. </p> <pre><code>var fs = require('fs'), path = require('path'), jsdom = require("jsdom"), parseUrl = require('url').parse, chainGang = require('chain-gang'); var chain = chainGang.create({ workers: 1 }); var Settings = { periodicUrl: 'http://en.wikipedia.org/wiki/Template:Periodic_table', periodicSelector: '#bodyContent &gt; table:first', pathPrefix: 'data/', ignoredProperties: ['Pronunciation'] }; function writeToFile(output) { var keys = 0; // Huge nests for finding the name of the element... 
yeah for(var i in output) { if(typeof output[i] === 'object' &amp;&amp; output[i] !== null){ for(var l in output[i]) { if(l.toLowerCase() === 'name') { var name = output[i][l]; } } keys += Object.keys(output[i]).length; } } console.log('Scraped ' + keys + ' properties for ' + name); console.log('Writing to ' + Settings.pathPrefix + name + '.json'); fs.writeFile(Settings.pathPrefix + name + '.json', JSON.stringify(output)); } // Generic create task function to create a task function that // would be passed to the chain gang function createTask (url, callback) { console.log('Task added - ' + url); return function(worker){ console.log('Requesting: ' +url); jsdom.env(url, [ 'jquery.min.js' // Local copy of jQuery ], function(errors, window) { if(errors){ console.log('Error! ' + errors) createTask(url, callback); } else { // Give me thy $ var $ = window.$; // Cleanup - remove unneeded elements $.fn.cleanup = function() { return this.each(function(){ $(this).find('sup.reference, .IPA').remove().end() .find('a, b, i, small, span').replaceWith(function(){ return this.innerHTML; }).end() .find('br').replaceWith(' '); }); } callback($); } worker.finish(); }); } } function processElement ($){ var infoBox = $('.infobox'), image = infoBox.find('tr:contains("Appearance") + tr img:first'), description = $('#toc').prevAll('p').cleanup(), headers = infoBox.find('tr:contains("properties")'), output = { Appearance: image.attr('src'), Description: $('.infobox + p').cleanup().html() }; headers.each(function(){ var that = this, title = this.textContent.trim(), rowspan = 0, rowspanHeading = ''; output[title] = {}; $(this).nextUntil('tr:has(th:only-child)').each(function(){ var t = $(this).cleanup(), headingEle = t.children('th'), data = t.children('td').html().trim(); if(headingEle.length) { var heading = headingEle.html().trim(); } // Skip to next heading if current property is ignored if(~Settings.ignoredProperties.indexOf(heading)) { return true; } if (rowspan) { 
output[title][rowspanHeading][data.split(':')[0].trim()] = data.split(':')[1].trim(); rowspan--; } else if (headingEle.attr('rowspan')){ rowspan = headingEle.attr('rowspan') - 1; rowspanHeading = heading; output[title][heading] = {}; output[title][heading][data.split(':')[0]] = data.split(':')[1]; } else if (~heading.indexOf(',')){ data = data.split(','); heading.split(',').forEach(function(v, i){ output[title][v.trim()] = data[i].trim(); }); } else { output[title][heading] = data; } }); }); writeToFile(output); } function fetchElements(elements) { elements.forEach(function(value){ // Element URL used here as task id (second argument) chain.add(createTask(value, processElement), value); }); } function processTable($){ var elementArray = $(Settings.periodicSelector).find('td').map(function(){ var t = $(this), atomicN = parseInt(t.text(), 10); if(atomicN &amp;&amp; t.children('a').length) { var elementUrl = 'http://' + parseUrl(Settings.periodicUrl).host + t.children('a:first').attr('href'); console.log(atomicN, t.children('a:first').attr('href').split('/').pop(), elementUrl); return elementUrl; } }).get(); fetchElements(elementArray); fs.writeFile(Settings.pathPrefix + 'elements.json', JSON.stringify(elementArray)); } // Get table - init function getPeriodicList(){ var elementsList = Settings.pathPrefix + 'elements.json'; if(path.existsSync(elementsList)){ var fileData = JSON.parse(fs.readFileSync(elementsList, 'utf8')); fetchElements(fileData); } else { chain.add(createTask(Settings.periodicUrl, processTable)); } } getPeriodicList(); </code></pre>
    singulars
    1. This table or related slice is empty.
    1. This table or related slice is empty.
    plurals
    1. This table or related slice is empty.
    1. This table or related slice is empty.
    1. This table or related slice is empty.
 

Querying!

 
Guidance

SQuiL has stopped working due to an internal error.

If you are curious you may find further information in the browser console, which is accessible through the devtools (F12).

Reload