在现代的软件开发中,Docx到Html的转换功能越来越受到重视,尤其是在内容管理系统(CMS)的开发中。本文旨在为C#开发者提供Docx到Html转换的解决方案,并整合了全球开发者的优秀解决方案,以减少开发者在寻找常见问题解决方案时的搜索时间。
为了实现Docx到Html的转换,需要以下工具和库:
以上提到的DLL文件可以在本文附带的项目中找到。
Docx到Html的转换需求日益增长,尤其是在开发WYSIWYG编辑器时。例如,本文所使用的编辑器如果能够直接从Docx文件上传内容,将大大提高效率。Docx文件本质上是一个打包文件,类似于标准的zip文件,具有标准化的结构。
Docx文件的详细打包结构可以在以下链接中查看:
将Docx文件转换为Html内容的过程相对简单。以下是一个C#代码示例:
DocxToHTML.Converter.HTMLConverter converter = new DocxToHTML.Converter.HTMLConverter();
string htmlContent = converter.ConvertToHtml("YOUR-DOCX-FILE");
在ASP.NET应用程序中,可以直接将转换后的Html内容发送给客户端。为了演示,本文在WinForm的WebBrowser控件中使用CKEditor控件显示输出。
在解析Docx内容时,需要检查是否有损坏的超链接,这可能会导致异常。以下代码用于处理该异常:
string htmlText = string.Empty;
try
{
htmlText = ParseDOCX(fileInfo);
}
catch (OpenXmlPackageException e)
{
if (e.ToString().Contains("Invalid Hyperlink"))
{
using (FileStream fs = new FileStream(fullFilePath, FileMode.OpenOrCreate, FileAccess.ReadWrite))
{
UriFixer.FixInvalidUri(fs, brokenUri => FixUri(brokenUri));
}
htmlText = ParseDOCX(fileInfo);
}
}
return htmlText;
实际的解析过程如下:
private string ParseDOCX(FileInfo fileInfo)
{
try
{
byte[] byteArray = File.ReadAllBytes(fileInfo.FullName);
using (MemoryStream memoryStream = new MemoryStream())
{
memoryStream.Write(byteArray, 0, byteArray.Length);
using (WordprocessingDocument wDoc = WordprocessingDocument.Open(memoryStream, true))
{
int imageCounter = 0;
var pageTitle = fileInfo.FullName;
var part = wDoc.CoreFilePropertiesPart;
if (part != null)
{
pageTitle = (string)part.GetXDocument().Descendants(DC.title).FirstOrDefault() ?? fileInfo.FullName;
}
WmlToHtmlConverterSettings settings = new WmlToHtmlConverterSettings()
{
AdditionalCss = "body { margin: 1cm auto; max-width: 20cm; padding: 0; }",
PageTitle = pageTitle,
FabricateCssClasses = true,
CssClassPrefix = "pt-",
RestrictToSupportedLanguages = false,
RestrictToSupportedNumberingFormats = false,
ImageHandler = imageInfo =>
{
++imageCounter;
string extension = imageInfo.ContentType.Split('/')[1].ToLower();
ImageFormat imageFormat = null;
if (extension == "png") imageFormat = ImageFormat.Png;
else if (extension == "gif") imageFormat = ImageFormat.Gif;
else if (extension == "bmp") imageFormat = ImageFormat.Bmp;
else if (extension == "jpeg") imageFormat = ImageFormat.Jpeg;
else if (extension == "tiff")
{
extension = "gif";
imageFormat = ImageFormat.Gif;
}
else if (extension == "x-wmf")
{
extension = "wmf";
imageFormat = ImageFormat.Wmf;
}
if (imageFormat == null)
return null;
string base64 = null;
try
{
using (MemoryStream ms = new MemoryStream())
{
imageInfo.Bitmap.Save(ms, imageFormat);
var ba = ms.ToArray();
base64 = System.Convert.ToBase64String(ba);
}
}
catch (System.Runtime.InteropServices.ExternalException)
{
return null;
}
ImageFormat format = imageInfo.Bitmap.RawFormat;
ImageCodecInfo codec = ImageCodecInfo.GetImageDecoders().First(c => c.FormatID == format.Guid);
string mimeType = codec.MimeType;
string imageSource = string.Format("data:{0};base64,{1}", mimeType, base64);
XElement img = new XElement(Xhtml.img,
new XAttribute(NoNamespace.src, imageSource),
imageInfo.ImgStyleAttribute,
imageInfo.AltText != null ?
new XAttribute(NoNamespace.alt, imageInfo.AltText) :
null);
return img;
}
};
XElement htmlElement = WmlToHtmlConverter.ConvertToHtml(wDoc, settings);
var html = new XDocument(
new XDocumentType("html", null, null, null),
htmlElement);
var htmlString = html.ToString(SaveOptions.DisableFormatting);
return htmlString;
}
}
}
catch
{
return "File contains corrupt data";
}
}
修复URI的代码如下:
private static string FixUri(string brokenUri)
{
string newURI = string.Empty;
if (brokenUri.Contains("mailto:"))
{
int mailToCount = "mailto:".Length;
brokenUri = brokenUri.Remove(0, mailToCount);
newURI = brokenUri;
}
else
{
newURI = "";
}
return newURI;
}