HTML tag, which fails when the title is UCS-2 encoded, or if it contains characters which are in pdfDocEncoding (an ISO 8859-1 superset), but not in ISO 8859-1. This patch fixes the problem by decoding UCS-2 or pdfDocEncoding into Unicode, then encoding this in the desired output encoding. HTML escaping wasn't being done either, so I have used the existing function HtmlFont::HtmlFilter to perform both HTML escaping and character set encoding. This static method had to be made public to call it from pdftohtml. See bug #37900. --- utils/HtmlFonts.h | 2 +- utils/pdftohtml.cc | 40 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/utils/HtmlFonts.h b/utils/HtmlFonts.h index a0ca78a..0ec8fa6 100644 --- a/utils/HtmlFonts.h +++ b/utils/HtmlFonts.h @@ -65,7 +65,6 @@ class HtmlFont{ static GooString *DefaultFont; GooString *FontName; HtmlFontColor color; - static GooString* HtmlFilter(Unicode* u, int uLen); //char* s); public: HtmlFont(){FontName=NULL;}; @@ -84,6 +83,7 @@ public: GooString* getFontName(); static GooString* getDefaultFont(); static void setDefaultFont(GooString* defaultFont); + static GooString* HtmlFilter(Unicode* u, int uLen); //char* s); GBool isEqual(const HtmlFont& x) const; GBool isEqualIgnoreBold(const HtmlFont& x) const; static GooString* simple(HtmlFont *font, Unicode *content, int uLen); diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc index b46bf1b..bb74f61 100644 --- a/utils/pdftohtml.cc +++ b/utils/pdftohtml.cc @@ -53,6 +53,7 @@ #endif #include "PSOutputDev.h" #include "GlobalParams.h" +#include "PDFDocEncoding.h" #include "Error.h" #include "DateInfo.h" #include "goo/gfile.h" @@ -511,13 +512,46 @@ int main(int argc, char *argv[]) { static GooString* getInfoString(Dict *infoDict, char *key) { Object obj; - GooString *s1 = NULL; + // Raw value as read from PDF (may be in pdfDocEncoding or UCS2) + GooString *rawString; + // Value converted to unicode + Unicode *unicodeString; + int 
unicodeLength; + // Value HTML escaped and converted to desired encoding + GooString *encodedString = NULL; + // Is rawString UCS2 (as opposed to pdfDocEncoding) + GBool isUnicode; if (infoDict->lookup(key, &obj)->isString()) { - s1 = new GooString(obj.getString()); + rawString = obj.getString(); + + // Convert rawString to unicode + encodedString = new GooString(); + if (rawString->hasUnicodeMarker()) { + isUnicode = gTrue; + unicodeLength = (obj.getString()->getLength() - 2) / 2; + } else { + isUnicode = gFalse; + unicodeLength = obj.getString()->getLength(); + } + unicodeString = new Unicode[unicodeLength]; + + for (int i=0; i<unicodeLength; i++) { + if (isUnicode) { + unicodeString[i] = ((rawString->getChar((i+1)*2) & 0xff) << 8) | + (rawString->getChar(((i+1)*2)+1) & 0xff); + } else { + unicodeString[i] = pdfDocEncoding[rawString->getChar(i) & 0xff]; + } + } + + // HTML escape and encode unicode + encodedString = HtmlFont::HtmlFilter(unicodeString, unicodeLength); + delete[] unicodeString; } + obj.free(); - return s1; + return encodedString; } static GooString* getInfoDate(Dict *infoDict, char *key) { -- 1.7.3.1

From a604ae4e7afe43417fa9d63f597072b70c4eb189 Mon Sep 17 00:00:00 2001 From: Steven Murdoch Date: Tue, 7 Jun 2011 01:03:09 +0100 Subject: [PATCH] Fix encoding of PDF document metadata in output of pdftohtml pdftohtml simply copies the PDF document title into the HTML <title> tag, which fails when the title is UCS-2 encoded, or if it contains characters which are in pdfDocEncoding (an ISO 8859-1 superset), but not in ISO 8859-1. This patch fixes the problem by decoding UCS-2 or pdfDocEncoding into Unicode, then encoding this in the desired output encoding. HTML escaping wasn't being done either, so I have used the existing function HtmlFont::HtmlFilter to perform both HTML escaping and character set encoding. This static method had to be made public to call it from pdftohtml. See bug #37900. --- utils/HtmlFonts.h | 2 +- utils/pdftohtml.cc | 40 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/utils/HtmlFonts.h b/utils/HtmlFonts.h index a0ca78a..0ec8fa6 100644 --- a/utils/HtmlFonts.h +++ b/utils/HtmlFonts.h @@ -65,7 +65,6 @@ class HtmlFont{ static GooString *DefaultFont; GooString *FontName; HtmlFontColor color; - static GooString* HtmlFilter(Unicode* u, int uLen); //char* s); public: HtmlFont(){FontName=NULL;}; @@ -84,6 +83,7 @@ public: GooString* getFontName(); static GooString* getDefaultFont(); static void setDefaultFont(GooString* defaultFont); + static GooString* HtmlFilter(Unicode* u, int uLen); //char* s); GBool isEqual(const HtmlFont& x) const; GBool isEqualIgnoreBold(const HtmlFont& x) const; static GooString* simple(HtmlFont *font, Unicode *content, int uLen); diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc index b46bf1b..bb74f61 100644 --- a/utils/pdftohtml.cc +++ b/utils/pdftohtml.cc @@ -53,6 +53,7 @@ #endif #include "PSOutputDev.h" #include "GlobalParams.h" +#include "PDFDocEncoding.h" #include "Error.h" #include "DateInfo.h" #include "goo/gfile.h" @@ -511,13 +512,46 @@ int main(int argc, char *argv[]) { 
static GooString* getInfoString(Dict *infoDict, char *key) { Object obj; - GooString *s1 = NULL; + // Raw value as read from PDF (may be in pdfDocEncoding or UCS2) + GooString *rawString; + // Value converted to unicode + Unicode *unicodeString; + int unicodeLength; + // Value HTML escaped and converted to desired encoding + GooString *encodedString = NULL; + // Is rawString UCS2 (as opposed to pdfDocEncoding) + GBool isUnicode; if (infoDict->lookup(key, &obj)->isString()) { - s1 = new GooString(obj.getString()); + rawString = obj.getString(); + + // Convert rawString to unicode + encodedString = new GooString(); + if (rawString->hasUnicodeMarker()) { + isUnicode = gTrue; + unicodeLength = (obj.getString()->getLength() - 2) / 2; + } else { + isUnicode = gFalse; + unicodeLength = obj.getString()->getLength(); + } + unicodeString = new Unicode[unicodeLength]; + + for (int i=0; i<unicodeLength; i++) { + if (isUnicode) { + unicodeString[i] = ((rawString->getChar((i+1)*2) & 0xff) << 8) | + (rawString->getChar(((i+1)*2)+1) & 0xff); + } else { + unicodeString[i] = pdfDocEncoding[rawString->getChar(i) & 0xff]; + } + } + + // HTML escape and encode unicode + encodedString = HtmlFont::HtmlFilter(unicodeString, unicodeLength); + delete[] unicodeString; } + obj.free(); - return s1; + return encodedString; } static GooString* getInfoDate(Dict *infoDict, char *key) { -- 1.7.3.1