清除 Word 格式的 C# 代碼

jopen 13年前發布 | 1K 次閱讀 iPhone 4S

/// <summary>
/// Entry point. Reads the HTML file named by the first command-line argument,
/// strips Word-specific markup from it, converts typographic characters to
/// HTML entities, and writes the result to "&lt;name&gt;.modified.htm".
/// </summary>
/// <param name="args">Command-line arguments; args[0] is the input file name or path.</param>
static void Main(string[] args)
{
    if (args.Length == 0 || String.IsNullOrEmpty(args[0]))
    {
        Console.WriteLine("No filename provided.");
        return;
    }

    string filepath = args[0];
    // A bare file name (no directory part) is resolved against the current directory.
    if (Path.GetFileName(filepath) == args[0])
    {
        filepath = Path.Combine(Environment.CurrentDirectory, filepath);
    }
    // Check the resolved path, and stop here: continuing would make
    // File.ReadAllText throw on the missing file.
    if (!File.Exists(filepath))
    {
        Console.WriteLine("File doesn't exist.");
        return;
    }

    string html = File.ReadAllText(filepath);
    Console.WriteLine("input html is " + html.Length + " chars");
    html = CleanWordHtml(html);
    html = FixEntities(html);
    // Only the file name (without directory) is kept, so the output is written
    // relative to the current working directory.
    filepath = Path.GetFileNameWithoutExtension(filepath) + ".modified.htm";
    File.WriteAllText(filepath, html);
    Console.WriteLine("cleaned html is " + html.Length + " chars");
}

/// <summary>
/// Removes Microsoft Word specific markup from an HTML string: comments, the
/// title element, class and style attributes, Office/structural tags, empty
/// paragraphs, v: attributes and runs of blank lines.
/// </summary>
/// <param name="html">The HTML text to clean. May be null or empty.</param>
/// <returns>The cleaned HTML; null or empty input is returned unchanged.</returns>
static string CleanWordHtml(string html)
{
    // Regex.Replace throws on null input; there is nothing to clean anyway.
    if (String.IsNullOrEmpty(html))
    {
        return html;
    }

    StringCollection sc = new StringCollection();
    // get rid of unnecessary tag spans (comments and title)
    sc.Add(@"<!--(\w|\W)+?-->");
    sc.Add(@"<title>(\w|\W)+?</title>");
    // Get rid of classes and styles
    sc.Add(@"\s?class=\w+");
    sc.Add(@"\s+style='[^']+'");
    // Get rid of unnecessary tags
    sc.Add(
        @"<(meta|link|/?o:|/?style|/?div|/?st\d|/?head|/?html|body|/?body|/?span|!\[)[^>]*?>");
    // Get rid of empty paragraph tags (those containing only a non-breaking space)
    sc.Add(@"(<[^>]+>)+&nbsp;(</\w+>)+");
    // remove bizarre v: element attached to img tags
    sc.Add(@"\s+v:\w+=""[^""]+""");
    // remove extra lines (accepts both CRLF and LF line endings)
    sc.Add(@"(\r?\n){2,}");

    // Order matters: attributes are stripped before the tags that carry them.
    foreach (string s in sc)
    {
        html = Regex.Replace(html, s, "", RegexOptions.IgnoreCase);
    }
    return html;
}

/// <summary>
/// Replaces typographic characters that Word inserts (curly double quotes and
/// the em dash) with their named HTML entities.
/// </summary>
/// <param name="html">The HTML text to process. May be null or empty.</param>
/// <returns>The text with the characters replaced; null or empty input is returned unchanged.</returns>
static string FixEntities(string html)
{
    if (String.IsNullOrEmpty(html))
    {
        return html;
    }

    // Keys are written as Unicode escapes so they survive any source-file
    // encoding round trip (a mis-encoded dash degrades to "?", which would
    // then rewrite every question mark in the document).
    NameValueCollection nvc = new NameValueCollection();
    nvc.Add("\u201C", "&ldquo;"); // left double quotation mark
    nvc.Add("\u201D", "&rdquo;"); // right double quotation mark
    nvc.Add("\u2014", "&mdash;"); // em dash
    foreach (string key in nvc.Keys)
    {
        html = html.Replace(key, nvc[key]);
    }
    return html;
}

 本文由用戶 jopen 自行上傳分享，僅供網友學習交流。所有權歸原作者，若您的權利被侵害，請聯繫管理員。
 轉載本站原創文章，請註明出處，並保留原始鏈接、圖片水印。
 本站是一個以用戶分享為主的開源技術平臺,歡迎各類分享!