Note that there are some explanatory texts on larger screens.

plurals
  1. POImplementing a web scraper in C#
    text
    copied!<p>I'm developing a web scraper, but I need to persist cookies between requests much like I can do in PHP using curl. However, it seems that if I try to use a <code>CookieContainer</code> object in C#, it doesn't grab all of the cookies from the response and send them to the next request. </p> <p>Here's my C# class:</p> <pre><code> public class Scraper { public string Username { get; set; } public string Password { get; set; } public string UserAgent { get; set; } public string ContentType { get; set; } public CookieCollection Cookies { get; set; } public CookieContainer Container { get; set; } public Scraper() { UserAgent = "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0"; ContentType = "application/x-www-form-urlencoded"; Cookies = new CookieCollection(); Container = new CookieContainer(); } public string Load(string uri, string postData = "", NetworkCredential creds = null, int timeout = 60000, string host = "", string referer = "", string requestedwith = "") { HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri); request.CookieContainer = Container; request.CookieContainer.Add(Cookies); request.UserAgent = UserAgent; request.AllowWriteStreamBuffering = true; request.ProtocolVersion = HttpVersion.Version11; request.AllowAutoRedirect = true; request.ContentType = ContentType; request.PreAuthenticate = true; if (requestedwith.Length &gt; 0) request.Headers["X-Requested-With"] = requestedwith; if (host.Length &gt; 0) request.Host = host; if (referer.Length &gt; 0) request.Referer = referer; if (timeout &gt; 0) request.Timeout = timeout; if (creds != null) request.Credentials = creds; if (postData.Length &gt; 0) { request.Method = "POST"; ASCIIEncoding encoding = new ASCIIEncoding(); byte[] data = encoding.GetBytes(postData); request.ContentLength = data.Length; Stream newStream = request.GetRequestStream(); //open connection newStream.Write(data, 0, data.Length); // Send the data. 
newStream.Close(); } else request.Method = "GET"; HttpWebResponse response = (HttpWebResponse)request.GetResponse(); Cookies = response.Cookies; StringBuilder page; using (StreamReader sr = new StreamReader(response.GetResponseStream())) { page = new StringBuilder(sr.ReadToEnd()); page = page.Replace("\r\n", ""); // strip all new lines and tabs page = page.Replace("\r", ""); // strip all new lines and tabs page = page.Replace("\n", ""); // strip all new lines and tabs page = page.Replace("\t", ""); // strip all new lines and tabs } string str = page.ToString(); str = Regex.Replace(str, @"&gt;\s+&lt;", "&gt;&lt;"); return str; } } </code></pre> <p>Here's my PHP code for loading and maintaining cookies in a cookie jar:</p> <pre><code> private function load($url = 'http://www.google.com/', $postData = array(), $headers = FALSE) { $useragent = "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; " . $this-&gt;locale . "; rv:1.9.2.10) Gecko/20100914 BRI/1 Firefox/3.6.10 ( .NET CLR 3.5.30729)"; $curl = curl_init(); curl_setopt($curl, CURLOPT_URL, $url); curl_setopt($curl, CURLOPT_RETURNTRANSFER, TRUE); curl_setopt($curl, CURLOPT_HEADER, FALSE); if($headers) curl_setopt($curl, CURLOPT_HTTPHEADER, array('X-Requested-With: XMLHttpRequest')); curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, FALSE); curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, FALSE); curl_setopt($curl, CURLOPT_ENCODING, 'UTF-8'); curl_setopt($curl, CURLOPT_USERAGENT, $useragent); curl_setopt($curl, CURLOPT_POST, !empty($postData)); if(!empty($postData)) curl_setopt($curl, CURLOPT_POSTFIELDS, $postData); curl_setopt($curl, CURLOPT_COOKIEFILE, $this-&gt;cookieFile); curl_setopt($curl, CURLOPT_COOKIEJAR, $this-&gt;cookieFile); $page = curl_exec ($curl); $page = str_replace(array("\r\n", "\r", "\n", "\t"), "", $page); // strip all new lines and tabs $page = preg_replace('~&gt;\s+&lt;~', '&gt;&lt;', $page);// strip all whitespace between tags curl_close ($curl); return $page; } </code></pre> <p>How do I successfully 
maintain cookies between requests?</p>
 

Querying!

 
Guidance

SQuiL has stopped working due to an internal error.

If you are curious, you may find further information in the browser console, which is accessible through the developer tools (F12).

Reload