Note that there are some explanatory texts on larger screens.

plurals
  1. PO
    primarykey
    data
    text
    // iTextSharp is the best bet. Used it to make a spider for lucene.Net so that it could crawl PDF.
    using System;
    using System.IO;
    using System.Text;
    using System.Text.RegularExpressions;
    using iTextSharp.text.pdf;

    namespace Spider.Utils
    {
        /// <summary>
        /// Parses a PDF file and extracts the text from it.
        /// </summary>
        /// <remarks>
        /// The extractor is a small hand-written scanner over a page's content
        /// stream. Operators it reacts to:
        /// BT = beginning of a text object operator,
        /// ET = end of a text object operator,
        /// Td / TD = move to the start of the next line,
        /// T*, ' and " = next-line operators,
        /// Tj = show text.
        /// (5 Ts = superscript and -5 Ts = subscript are not interpreted.)
        /// Text is taken from the literal strings between '(' and ')'.
        /// </remarks>
        public class PDFParser
        {
            #region Fields

            /// <summary>
            /// The number of most recent characters kept for operator look-back.
            /// </summary>
            private const int _numberOfCharsToKeep = 15;

            /// <summary>
            /// Width, in characters, of the console progress bar.
            /// </summary>
            private const int ProgressBarWidth = 68;

            // Operator groups, allocated once instead of once per scanned byte.
            private static readonly string[] LineMoveTokens = { "TD", "Td" };
            private static readonly string[] NextLineTokens = { "'", "T*", "\"" };
            private static readonly string[] ShowTextTokens = { "Tj" };
            private static readonly string[] EndTextTokens = { "ET" };
            private static readonly string[] BeginTextTokens = { "BT" };

            /// <summary>
            /// Literal escape sequences left in the raw text (first column) and
            /// the character each one is replaced with (second column). The
            /// replacements are applied in this order.
            /// </summary>
            private static readonly string[,] EscapeReplacements =
            {
                { @"\(", "(" }, { @"\)", ")" },
                { @"\226", "-" }, { @"\222", "'" }, { @"\223", "\"" }, { @"\224", "\"" },
                { @"\340", "à" }, { @"\342", "â" }, { @"\344", "ä" },
                { @"\300", "À" }, { @"\302", "Â" }, { @"\304", "Ä" },
                { @"\351", "é" }, { @"\350", "è" }, { @"\352", "ê" }, { @"\353", "ë" },
                { @"\311", "É" }, { @"\310", "È" }, { @"\312", "Ê" }, { @"\313", "Ë" },
                { @"\362", "ò" }, { @"\364", "ô" }, { @"\366", "ö" },
                { @"\322", "Ò" }, { @"\324", "Ô" }, { @"\326", "Ö" },
                { @"\354", "ì" }, { @"\356", "î" }, { @"\357", "ï" },
                { @"\314", "Ì" }, { @"\316", "Î" }, { @"\317", "Ï" },
                { @"\347", "ç" }, { @"\307", "Ç" },
                { @"\371", "ù" }, { @"\373", "û" }, { @"\374", "ü" },
                { @"\331", "Ù" }, { @"\333", "Û" }, { @"\334", "Ü" },
                { @"\256", "®" }, { @"\231", "™" }, { @"\253", "«" }, { @"\273", "»" },
                { @"\251", "©" }, { @"\221", "'" }
            };

            #endregion

            #region ExtractText

            /// <summary>
            /// Extracts the text of every page of a PDF file and writes it,
            /// UTF-8 encoded, to an output file. A '#' progress bar is written
            /// to the console while pages are processed.
            /// </summary>
            /// <param name="inFileName">the full path to the pdf file.</param>
            /// <param name="outFileName">the output file name.</param>
            /// <returns>
            /// <c>true</c> when all pages were processed; <c>false</c> when any
            /// exception occurred (the reason is reported on standard error).
            /// </returns>
            public bool ExtractText(string inFileName, string outFileName)
            {
                PdfReader reader = null;
                StreamWriter outFile = null;
                try
                {
                    // Open the reader first so that an unreadable PDF does not
                    // leave an empty output file behind.
                    reader = new PdfReader(inFileName);
                    outFile = new StreamWriter(outFileName, false, Encoding.UTF8);

                    Console.Write("Processing: ");

                    int pageCount = reader.NumberOfPages;
                    // Number of '#' characters each page contributes to the bar.
                    float charUnit = (float)ProgressBarWidth / pageCount;
                    int totalWritten = 0;
                    float curUnit = 0;

                    for (int page = 1; page <= pageCount; page++)
                    {
                        outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");

                        // Write the progress.
                        if (charUnit >= 1.0f)
                        {
                            totalWritten += WriteProgress((int)charUnit);
                        }
                        else
                        {
                            // Less than one '#' per page: accumulate until at
                            // least one whole character is due.
                            curUnit += charUnit;
                            if (curUnit >= 1.0f)
                            {
                                totalWritten += WriteProgress((int)curUnit);
                                curUnit = 0;
                            }
                        }
                    }

                    // Rounding leaves the bar short; pad it to full width.
                    if (totalWritten < ProgressBarWidth)
                    {
                        WriteProgress(ProgressBarWidth - totalWritten);
                    }

                    return true;
                }
                catch (Exception ex)
                {
                    // Callers rely on the boolean result, so failures are still
                    // reported as false, but no longer silently.
                    Console.Error.WriteLine("PDF text extraction failed for '{0}': {1}", inFileName, ex.Message);
                    return false;
                }
                finally
                {
                    if (outFile != null)
                    {
                        outFile.Close();
                    }

                    if (reader != null)
                    {
                        reader.Close();
                    }
                }
            }

            /// <summary>
            /// Writes <paramref name="count"/> progress characters to the console.
            /// </summary>
            /// <returns>The number of characters written.</returns>
            private static int WriteProgress(int count)
            {
                if (count <= 0)
                {
                    return 0;
                }

                Console.Write(new string('#', count));
                return count;
            }

            #endregion

            #region ExtractTextFromPDFBytes

            /// <summary>
            /// This method processes an uncompressed Adobe (text) object
            /// and extracts text.
            /// </summary>
            /// <param name="input">uncompressed content-stream bytes</param>
            /// <returns>
            /// The extracted text, or an empty string when
            /// <paramref name="input"/> is null or empty.
            /// </returns>
            public string ExtractTextFromPDFBytes(byte[] input)
            {
                if (input == null || input.Length == 0)
                {
                    return "";
                }

                StringBuilder result = new StringBuilder();

                // Flag showing if we are currently inside a text object.
                bool inTextObject = false;

                // Flag showing if the next character is literal,
                // e.g. '\\' to get a '\' character or '\(' to get '('.
                bool nextLiteral = false;

                // () bracket nesting level. Text appears inside ().
                int bracketDepth = 0;

                // Sliding window of the most recent characters, used to spot
                // operators after they have been read.
                char[] previousCharacters = new char[_numberOfCharsToKeep];
                for (int j = 0; j < previousCharacters.Length; j++)
                {
                    previousCharacters[j] = ' ';
                }

                for (int i = 0; i < input.Length; i++)
                {
                    char c = (char)input[i];
                    if (input[i] == 213)
                    {
                        c = '\'';
                    }

                    if (inTextObject)
                    {
                        // Position the text.
                        if (bracketDepth == 0)
                        {
                            if (CheckToken(LineMoveTokens, previousCharacters))
                            {
                                result.Append("\r\n");
                            }
                            else if (CheckToken(NextLineTokens, previousCharacters))
                            {
                                result.Append("\n");
                            }
                            else if (CheckToken(ShowTextTokens, previousCharacters))
                            {
                                result.Append(" ");
                            }
                        }

                        if (bracketDepth == 0 && CheckToken(EndTextTokens, previousCharacters))
                        {
                            // End of a text object; separate it from what follows.
                            inTextObject = false;
                            result.Append(" ");
                        }
                        else if (c == '(' && bracketDepth == 0 && !nextLiteral)
                        {
                            // Start outputting text.
                            bracketDepth = 1;
                        }
                        else if (c == ')' && bracketDepth == 1 && !nextLiteral)
                        {
                            // Stop outputting text.
                            bracketDepth = 0;
                        }
                        else if (bracketDepth == 1)
                        {
                            if (c == '\\' && !nextLiteral)
                            {
                                // Keep the backslash so CleanupContent can
                                // recognise the escape sequence afterwards.
                                result.Append(c);
                                nextLiteral = true;
                            }
                            else
                            {
                                // Only printable characters are kept.
                                if ((c >= ' ' && c <= '~') || (c >= 128 && c < 255))
                                {
                                    result.Append(c);
                                }

                                nextLiteral = false;
                            }
                        }
                    }

                    // Slide the look-back window and store the current character.
                    Array.Copy(previousCharacters, 1, previousCharacters, 0, previousCharacters.Length - 1);
                    previousCharacters[previousCharacters.Length - 1] = c;

                    // Start of a text object.
                    if (!inTextObject && CheckToken(BeginTextTokens, previousCharacters))
                    {
                        inTextObject = true;
                    }
                }

                return CleanupContent(result.ToString());
            }

            /// <summary>
            /// Replaces the escape sequences that the scanner leaves in the raw
            /// text (escaped brackets and a fixed set of octal character codes)
            /// with the characters they stand for.
            /// </summary>
            /// <param name="text">the raw extracted text</param>
            /// <returns>the text with known escape sequences resolved</returns>
            private string CleanupContent(string text)
            {
                // The sequences are fixed literals, so plain ordinal replacement
                // does the same job as a regular expression per sequence.
                for (int i = 0; i < EscapeReplacements.GetLength(0); i++)
                {
                    text = text.Replace(EscapeReplacements[i, 0], EscapeReplacements[i, 1]);
                }

                return text;
            }

            #endregion

            #region CheckToken

            /// <summary>
            /// Checks if one of the given operator tokens just came along
            /// (e.g. BT): the token must sit directly before the most recent
            /// character and be delimited by whitespace on both sides.
            /// </summary>
            /// <remarks>
            /// Tokens of any length that fits the look-back window are
            /// supported. Indexing a fixed two characters would run past the
            /// end of the one-character operators ' and ".
            /// </remarks>
            /// <param name="tokens">the searched tokens</param>
            /// <param name="recent">the recent character array, oldest first</param>
            /// <returns><c>true</c> when any of the tokens matches</returns>
            private static bool CheckToken(string[] tokens, char[] recent)
            {
                int last = recent.Length - 1;

                // Every token must be followed by a delimiter.
                if (!IsTokenDelimiter(recent[last]))
                {
                    return false;
                }

                foreach (string token in tokens)
                {
                    // Index of the token's first character inside the window.
                    int start = last - token.Length;
                    if (token.Length == 0 || start < 1)
                    {
                        continue;
                    }

                    if (!IsTokenDelimiter(recent[start - 1]))
                    {
                        continue;
                    }

                    bool matches = true;
                    for (int k = 0; k < token.Length; k++)
                    {
                        if (recent[start + k] != token[k])
                        {
                            matches = false;
                            break;
                        }
                    }

                    if (matches)
                    {
                        return true;
                    }
                }

                return false;
            }

            /// <summary>
            /// Tells whether a character separates operators: space, CR or LF.
            /// </summary>
            private static bool IsTokenDelimiter(char c)
            {
                return c == ' ' || c == 0x0d || c == 0x0a;
            }

            #endregion
        }
    }
    singulars
    1. This table or related slice is empty.
    1. This table or related slice is empty.
    plurals
    1. This table or related slice is empty.
    1. This table or related slice is empty.
    1. This table or related slice is empty.
    1. VO
      singulars
      1. This table or related slice is empty.
    2. VO
      singulars
      1. This table or related slice is empty.
    3. VO
      singulars
      1. This table or related slice is empty.
 

Querying!

 
Guidance

SQuiL has stopped working due to an internal error.

If you are curious you may find further information in the browser console, which is accessible through the devtools (F12).

Reload