Word Doc to Simple HTML Converter

Costas

Administrator
Staff member
C#:
//src - http://www.geoffstratton.com/cnet-docdocx-simple-html-converter
// Alternatively, you can always create the Word instance via late binding:
// object wordApp = Activator.CreateInstance(Type.GetTypeFromProgID("Word.Application"));
using System;
using System.Collections.Generic;
using System.Windows.Forms;
using Microsoft.Office.Interop.Word;
using System.IO;
using System.Diagnostics;
using HtmlAgilityPack;
 
namespace DocConverter
{
    /// <summary>
    /// Form that accepts a dropped Word document, asks Word to export it as
    /// filtered HTML, simplifies that HTML with Html Agility Pack and opens the
    /// result in Notepad.
    /// </summary>
    public partial class docForm : Form
    {
        // File names used inside the user's temp directory.
        private const string WordHtmlFileName = "temp.html";
        private const string FinalHtmlFileName = "finaltemp.html";

        // "Blacklisted" tags are removed completely, including everything inside them.
        private static readonly string[] BlacklistedTags = { "span", "head" };

        // "Stripped" tags keep their content but lose all attributes,
        // so <p class="MsoNormal"> becomes <p>.
        private static readonly string[] StrippedTags =
            { "body", "div", "p", "strong", "ul", "li", "table", "tr", "td" };

        /// <summary>
        /// Builds the form and wires up the drag-and-drop handlers.
        /// </summary>
        public docForm()
        {
            InitializeComponent();
            this.AllowDrop = true;
            this.DragEnter += new DragEventHandler(docForm_DragEnter);
            this.DragDrop += new DragEventHandler(docForm_DragDrop);
        }

        /// <summary>
        /// Shows the "copy" cursor when the dragged data contains files.
        /// </summary>
        void docForm_DragEnter(object sender, DragEventArgs e)
        {
            if (e.Data.GetDataPresent(DataFormats.FileDrop))
            {
                e.Effect = DragDropEffects.Copy;
            }
        }

        /// <summary>
        /// Converts the first dropped file to simplified HTML and opens the
        /// result (finaltemp.html in the temp directory) in Notepad.
        /// </summary>
        void docForm_DragDrop(object sender, DragEventArgs e)
        {
            // Gives us the paths of the dropped files; only the first one is converted.
            string[] files = e.Data.GetData(DataFormats.FileDrop) as string[];
            if (files == null || files.Length == 0)
            {
                // Nothing usable was dropped.
                return;
            }

            string tempDir = Path.GetTempPath();
            string wordHtmlPath = Path.Combine(tempDir, WordHtmlFileName);
            string finalHtmlPath = Path.Combine(tempDir, FinalHtmlFileName);

            try
            {
                ExportFilteredHtml(files[0], wordHtmlPath);

                // Now, clean up Word's HTML using Html Agility Pack
                HtmlAgilityPack.HtmlDocument mangledHTML = new HtmlAgilityPack.HtmlDocument();
                mangledHTML.Load(wordHtmlPath);

                RemoveElements(mangledHTML, BlacklistedTags);
                StripAttributes(mangledHTML, StrippedTags);

                // Serialise in memory instead of round-tripping through a second
                // temp file, so the text is not written in one encoding and then
                // re-read with another.
                string cleanedHtml;
                using (StringWriter writer = new StringWriter())
                {
                    mangledHTML.Save(writer);
                    cleanedHtml = writer.ToString();
                }

                File.WriteAllText(finalHtmlPath, CollapseLineBreaks(cleanedHtml));
            }
            finally
            {
                // Clean up Word's intermediate output even when conversion fails.
                File.Delete(wordHtmlPath);
            }

            // Show the finished result in Notepad; quoted because the temp path
            // may contain spaces.
            Process.Start("notepad.exe", "\"" + finalHtmlPath + "\"");
        }

        // Opens sourcePath in Word and saves it as filtered HTML to htmlPath,
        // always closing the document and quitting Word afterwards.
        private static void ExportFilteredHtml(string sourcePath, string htmlPath)
        {
            Microsoft.Office.Interop.Word.Application application = new
                Microsoft.Office.Interop.Word.Application();
            try
            {
                Document doc = application.Documents.Open(sourcePath);
                try
                {
                    // wdFormatHTML gives more "complete" but worse HTML.
                    doc.SaveAs(htmlPath, WdSaveFormat.wdFormatFilteredHTML);
                }
                finally
                {
                    doc.Close();
                }
            }
            finally
            {
                // Close Word, otherwise a failed conversion leaves it running.
                application.Quit();
            }
        }

        // Removes every element with one of the given tag names, together with its content.
        private static void RemoveElements(HtmlAgilityPack.HtmlDocument html, string[] tagNames)
        {
            foreach (string tagName in tagNames)
            {
                HtmlNodeCollection matches = html.DocumentNode.SelectNodes("//" + tagName);
                if (matches == null)
                {
                    // No tags of that type; skip it and move on
                    continue;
                }

                foreach (HtmlNode item in matches)
                {
                    item.ParentNode.RemoveChild(item);
                }
            }
        }

        // Removes all attributes from every element with one of the given tag names.
        private static void StripAttributes(HtmlAgilityPack.HtmlDocument html, string[] tagNames)
        {
            foreach (string tagName in tagNames)
            {
                HtmlNodeCollection matches = html.DocumentNode.SelectNodes("//" + tagName);
                if (matches == null)
                {
                    // No tags of that type; skip it and move on
                    continue;
                }

                foreach (HtmlNode item in matches)
                {
                    item.Attributes.RemoveAll();
                }
            }
        }

        // Removes standalone CRLFs and turns each CRLF pair into a single CRLF.
        // Splitting on the pair avoids a placeholder string that could collide
        // with text in the document.
        private static string CollapseLineBreaks(string html)
        {
            string[] chunks = html.Split(new[] { "\r\n\r\n" }, StringSplitOptions.None);
            for (int i = 0; i < chunks.Length; i++)
            {
                chunks[i] = chunks[i].Replace("\r\n", "");
            }

            return string.Join("\r\n", chunks);
        }

    }
 
}
 
Top