From 928a568fe857fe9ce1377e7ae50055620322e0fe Mon Sep 17 00:00:00 2001 From: Steven Murdoch Date: Fri, 3 Jun 2011 23:47:01 +0100 Subject: [PATCH] Fix pdftotext -htmlmeta to correctly output U+2019 in PDF metadata In PDF documents, right single quotation mark (U+2019) may be encoded as 0x90 because PDFDocEncoding uses some of the reserved characters in ISO 8859-1. However, pdftotext -htmlmeta assumes that characters are either UCS-2 or ISO 8859-1. Thus, when a right single quotation mark is encoded as 0x90, it is output as Unicode U+0090 (which is a control character). pdfinfo does the right thing by first converting from PDFDocEncoding to Unicode with pdfDocEncoding[], before encoding it in the desired character set. This patch applies the same logic to pdftotext. pdftohtml is broken in the same way, but this patch does not attempt to fix it. --- utils/pdftotext.cc | 3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc index 5d1cfb5..eae4058 100644 --- a/utils/pdftotext.cc +++ b/utils/pdftotext.cc @@ -50,6 +50,7 @@ #include "TextOutputDev.h" #include "CharTypes.h" #include "UnicodeMap.h" +#include "PDFDocEncoding.h" #include "Error.h" #include @@ -452,7 +453,7 @@ static void printInfoString(FILE *f, Dict *infoDict, char *key, (s1->getChar(i+1) & 0xff); i += 2; } else { - u = s1->getChar(i) & 0xff; + u = pdfDocEncoding[s1->getChar(i) & 0xff]; ++i; } n = uMap->mapUnicode(u, buf, sizeof(buf)); -- 1.7.3.1