Decode byte strings in .msg files correctly

Non-Unicode strings in .msg files are encoded using an encoding that is
defined in a separate message property (PR_INTERNET_CPID for the body,
PR_MESSAGE_CODEPAGE for everything else).

The specification says that this property is required, however some real
world .msg files do not have it. This is why the decoding code has a
fallback to "cp1252" (Windows code page 1252, "Western Europe").

fixes #24
This commit is contained in:
Martijn van de Streek
2024-02-22 11:25:12 +01:00
committed by Martijn van de Streek
parent 6f1a6e4b4a
commit 674896d603
+71 -4
View File
@@ -192,6 +192,7 @@ def parse_properties(properties, is_top_level, container, doc):
# Read 16-byte entries. # Read 16-byte entries.
ret = { } ret = { }
tags_to_decode = []
while i < len(stream): while i < len(stream):
# Read the entry. # Read the entry.
property_type = stream[i+0:i+2] property_type = stream[i+0:i+2]
@@ -225,6 +226,9 @@ def parse_properties(properties, is_top_level, container, doc):
logger.error("stream missing {}".format(streamname)) logger.error("stream missing {}".format(streamname))
continue continue
if isinstance(tag_type, STRING8):
tags_to_decode.append(tag_name)
value = tag_type.load(value) value = tag_type.load(value)
elif isinstance(tag_type, EMBEDDED_MESSAGE): elif isinstance(tag_type, EMBEDDED_MESSAGE):
@@ -248,6 +252,37 @@ def parse_properties(properties, is_top_level, container, doc):
ret[tag_name] = value ret[tag_name] = value
# Post-processing: decode String8 strings using the code pages defined
# in properties.
#
# These properties are required to be present but in the real world they
# aren't always there, so there's a fallback to code page 1252 (Western
# Europe)
# The encoding of the "BODY" (and HTML body) properties
if "PR_INTERNET_CPID" in ret:
body_codepage = code_pages.get(ret['PR_INTERNET_CPID'], 'cp1252')
else:
body_codepage = 'cp1252'
# The encoding of "string properties of the message object"
if "PR_MESSAGE_CODEPAGE" in ret:
message_codepage = code_pages.get(ret['PR_MESSAGE_CODEPAGE'], 'cp1252')
else:
message_codepage = 'cp1252'
for tag_name in tags_to_decode:
decoded = False
if tag_name == "BODY":
try:
ret[tag_name] = ret[tag_name].decode(body_codepage)
decoded = True
except UnicodeDecodeError:
pass
if not decoded:
ret[tag_name] = ret[tag_name].decode(message_codepage, errors='replace')
return ret return ret
@@ -315,10 +350,9 @@ class BINARY(VariableLengthValueLoader):
class STRING8(VariableLengthValueLoader): class STRING8(VariableLengthValueLoader):
@staticmethod @staticmethod
def load(value): def load(value):
# value is a bytestring. I haven't seen specified what character encoding # Value is a "bytestring"
# is used when the Unicode storage type is not used, so we'll assume it's # Decoding will be done at a later stage, once the code page/codec is known
# ASCII or Latin-1 like but we'll use UTF-8 to cover the bases. return value
return value.decode("utf8")
class UNICODE(VariableLengthValueLoader): class UNICODE(VariableLengthValueLoader):
@staticmethod @staticmethod
@@ -817,8 +851,41 @@ property_tags = {
0x3F06: ('YPOS', 'I4'), 0x3F06: ('YPOS', 'I4'),
0x3F07: ('CONTROL_ID', 'BINARY'), 0x3F07: ('CONTROL_ID', 'BINARY'),
0x3F08: ('INITIAL_DETAILS_PANE', 'I4'), 0x3F08: ('INITIAL_DETAILS_PANE', 'I4'),
0x3FDE: ('PR_INTERNET_CPID', 'I4'),
0x3FFD: ('PR_MESSAGE_CODEPAGE', 'I4'),
} }
code_pages = {
# Microsoft code page id: python codec name
437: "cp437",
850: "cp850",
852: "cp852",
936: "gb2312",
1250: "cp1250",
1251: "cp1251",
1252: "cp1252",
1253: "cp1253",
1254: "cp1254",
1255: "cp1255",
1256: "cp1256",
1257: "cp1257",
1258: "cp1258",
20127: "ascii",
20866: "koi8-r",
21866: "koi8-u",
28591: "iso8859_1",
28592: "iso8859_2",
28593: "iso8859_3",
28594: "iso8859_4",
28595: "iso8859_5",
28596: "iso8859_6",
28597: "iso8859_7",
28598: "iso8859_8",
28599: "iso8859_9",
28603: "iso8859_13",
28605: "iso8859_15",
65001: "utf-8",
}
# COMMAND-LINE ENTRY POINT # COMMAND-LINE ENTRY POINT