Note that there are some explanatory texts on larger screens.

plurals
  1. PO
    primarykey
    data
    text
    <p>It is not perfect of course, but here is the code I use to convert HTML to plain text.</p> <p>(I was not the original author, I adapted it from code found on the web)</p> <pre><code>public static string ConvertHtmlToText(string source) { string result; // Remove HTML Development formatting // Replace line breaks with space // because browsers inserts space result = source.Replace("\r", " "); // Replace line breaks with space // because browsers inserts space result = result.Replace("\n", " "); // Remove step-formatting result = result.Replace("\t", string.Empty); // Remove repeating speces becuase browsers ignore them result = System.Text.RegularExpressions.Regex.Replace(result, @"( )+", " "); // Remove the header (prepare first by clearing attributes) result = System.Text.RegularExpressions.Regex.Replace(result, @"&lt;( )*head([^&gt;])*&gt;", "&lt;head&gt;", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"(&lt;( )*(/)( )*head( )*&gt;)", "&lt;/head&gt;", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(&lt;head&gt;).*(&lt;/head&gt;)", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // remove all scripts (prepare first by clearing attributes) result = System.Text.RegularExpressions.Regex.Replace(result, @"&lt;( )*script([^&gt;])*&gt;", "&lt;script&gt;", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"(&lt;( )*(/)( )*script( )*&gt;)", "&lt;/script&gt;", System.Text.RegularExpressions.RegexOptions.IgnoreCase); //result = System.Text.RegularExpressions.Regex.Replace(result, // @"(&lt;script&gt;)([^(&lt;script&gt;\.&lt;/script&gt;)])*(&lt;/script&gt;)", // string.Empty, // System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"(&lt;script&gt;).*(&lt;/script&gt;)", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // remove all styles (prepare first by clearing attributes) result = System.Text.RegularExpressions.Regex.Replace(result, @"&lt;( )*style([^&gt;])*&gt;", "&lt;style&gt;", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"(&lt;( )*(/)( )*style( )*&gt;)", "&lt;/style&gt;", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(&lt;style&gt;).*(&lt;/style&gt;)", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // insert tabs in spaces of &lt;td&gt; tags result = System.Text.RegularExpressions.Regex.Replace(result, @"&lt;( )*td([^&gt;])*&gt;", "\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // insert line breaks in places of &lt;BR&gt; and &lt;LI&gt; tags result = System.Text.RegularExpressions.Regex.Replace(result, @"&lt;( )*br( )*&gt;", "\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&lt;( )*li( )*&gt;", "\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // insert line paragraphs (double line breaks) in place // if &lt;P&gt;, &lt;DIV&gt; and &lt;TR&gt; tags result = System.Text.RegularExpressions.Regex.Replace(result, @"&lt;( )*div([^&gt;])*&gt;", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&lt;( )*tr([^&gt;])*&gt;", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&lt;( )*p([^&gt;])*&gt;", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Remove remaining tags like &lt;a&gt;, links, images, // comments etc - anything thats enclosed inside &lt; &gt; result = System.Text.RegularExpressions.Regex.Replace(result, @"&lt;[^&gt;]*&gt;", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // replace special characters: result = System.Text.RegularExpressions.Regex.Replace(result, @"&amp;nbsp;", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&amp;bull;", " * ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&amp;lsaquo;", "&lt;", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&amp;rsaquo;", "&gt;", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&amp;trade;", "(tm)", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&amp;frasl;", "/", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&lt;", "&lt;", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&gt;", "&gt;", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&amp;copy;", "(c)", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&amp;reg;", "(r)", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Remove all others. More can be added, see // http://hotwired.lycos.com/webmonkey/reference/special_characters/ result = System.Text.RegularExpressions.Regex.Replace(result, @"&amp;(.{2,6});", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // make line breaking consistent result = result.Replace("\n", "\r"); // Remove extra line breaks and tabs: // replace over 2 breaks with 2 and over 4 tabs with 4. // Prepare first to remove any whitespaces inbetween // the escaped characters and remove redundant tabs inbetween linebreaks result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\r)", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\t)", "\t\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\r)", "\t\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\t)", "\r\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Remove redundant tabs result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+(\r)", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Remove multible tabs followind a linebreak with just one tab result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+", "\r\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Initial replacement target string for linebreaks string breaks = "\r\r\r"; // Initial replacement target string for tabs string tabs = "\t\t\t\t\t"; for (int index = 0; index &lt; result.Length; index++) { result = result.Replace(breaks, "\r\r"); result = result.Replace(tabs, "\t\t\t\t"); breaks = breaks + "\r"; tabs = tabs + "\t"; } // Thats it. return result; } </code></pre>
    singulars
    1. This table or related slice is empty.
    1. This table or related slice is empty.
    plurals
    1. This table or related slice is empty.
    1. This table or related slice is empty.
    1. This table or related slice is empty.
    1. This table or related slice is empty.
    1. VO
      singulars
      1. This table or related slice is empty.
    2. VO
      singulars
      1. This table or related slice is empty.
    3. VO
      singulars
      1. This table or related slice is empty.
 

Querying!

 
Guidance

SQuiL has stopped working due to an internal error.

If you are curious you may find further information in the browser console, which is accessible through the devtools (F12).

Reload