Use rtfparse to extract HTML message bodies from RTF containers and create multipart/alternative messages if both plain text and HTML are available
Also fixes #20.
This commit is contained in:
@@ -1,22 +1,24 @@
|
|||||||
Convert Outlook .msg Files to .eml (MIME format)
|
Convert Outlook .msg Files to .eml (MIME format)
|
||||||
================================================
|
================================================
|
||||||
|
|
||||||
This repository contains a Python 3.6 module for
|
This repository contains a Python 3.9+ module for
|
||||||
reading Microsoft Outlook .msg files and converting
|
reading Microsoft Outlook .msg files and converting
|
||||||
them to .eml format, which is the standard MIME
|
them to .eml format, which is the standard MIME
|
||||||
format for email messages.
|
format for email messages.
|
||||||
|
|
||||||
Install the dependencies with:
|
Install the dependencies with:
|
||||||
|
|
||||||
pip3.6 install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
(You may need to create and activate a Python virtual environment first.)
|
||||||
|
|
||||||
Then either convert a single file by piping:
|
Then either convert a single file by piping:
|
||||||
|
|
||||||
python3.6 outlookmsgfile.py < message.msg > message.eml
|
python outlookmsgfile.py < message.msg > message.eml
|
||||||
|
|
||||||
Or convert a set of files:
|
Or convert a set of files:
|
||||||
|
|
||||||
python3.6 outlookmsgfile.py *.msg
|
python outlookmsgfile.py *.msg
|
||||||
|
|
||||||
When passing filenames as command-line arguments, a new file with `.eml`
|
When passing filenames as command-line arguments, a new file with `.eml`
|
||||||
appended to the filename is written out with the message in MIME format.
|
appended to the filename is written out with the message in MIME format.
|
||||||
|
|||||||
+37
-10
@@ -16,6 +16,7 @@ import re
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import io
|
||||||
|
|
||||||
from functools import reduce
|
from functools import reduce
|
||||||
|
|
||||||
@@ -23,6 +24,8 @@ import email.message, email.parser, email.policy
|
|||||||
from email.utils import parsedate_to_datetime, formatdate, formataddr
|
from email.utils import parsedate_to_datetime, formatdate, formataddr
|
||||||
|
|
||||||
import compoundfiles
|
import compoundfiles
|
||||||
|
from rtfparse.parser import Rtf_Parser
|
||||||
|
from rtfparse.renderers.de_encapsulate_html import De_encapsulate_HTML
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -102,34 +105,58 @@ def load_message_stream(entry, is_top_level, doc):
|
|||||||
msg['Subject'] = props['SUBJECT']
|
msg['Subject'] = props['SUBJECT']
|
||||||
del props['SUBJECT']
|
del props['SUBJECT']
|
||||||
|
|
||||||
# Add the plain-text body from the BODY field.
|
# Add a plain text body from the BODY field.
|
||||||
|
has_body = False
|
||||||
if 'BODY' in props:
|
if 'BODY' in props:
|
||||||
body = props['BODY']
|
body = props['BODY']
|
||||||
if isinstance(body, str):
|
if isinstance(body, str):
|
||||||
msg.set_content(body, cte='quoted-printable')
|
msg.set_content(body, cte='quoted-printable')
|
||||||
else:
|
else:
|
||||||
msg.set_content(body, maintype="text", subtype="plain", cte='8bit')
|
msg.set_content(body, maintype="text", subtype="plain", cte='8bit')
|
||||||
|
has_body = True
|
||||||
|
|
||||||
# Plain-text is not available. Use the rich text version.
|
# Add an HTML body from the RTF_COMPRESSED field.
|
||||||
else:
|
if 'RTF_COMPRESSED' in props:
|
||||||
doc.rtf_attachments += 1
|
|
||||||
fn = "messagebody_{}.rtf".format(doc.rtf_attachments)
|
|
||||||
|
|
||||||
msg.set_content(
|
|
||||||
"<no plain text message body --- see attachment {}>".format(fn),
|
|
||||||
cte='quoted-printable')
|
|
||||||
|
|
||||||
# Decompress the value to Rich Text Format.
|
# Decompress the value to Rich Text Format.
|
||||||
import compressed_rtf
|
import compressed_rtf
|
||||||
rtf = props['RTF_COMPRESSED']
|
rtf = props['RTF_COMPRESSED']
|
||||||
rtf = compressed_rtf.decompress(rtf)
|
rtf = compressed_rtf.decompress(rtf)
|
||||||
|
|
||||||
|
# Try rtfparse to de-encapsulate HTML stored in a rich
|
||||||
|
# text container.
|
||||||
|
try:
|
||||||
|
rtf_blob = io.BytesIO(rtf)
|
||||||
|
parsed = Rtf_Parser(rtf_file=rtf_blob).parse_file()
|
||||||
|
html_stream = io.StringIO()
|
||||||
|
De_encapsulate_HTML().render(parsed, html_stream)
|
||||||
|
html_body = html_stream.getvalue()
|
||||||
|
|
||||||
|
if not has_body:
|
||||||
|
msg.set_content(html_body, subtype="html", cte='quoted-printable')
|
||||||
|
has_body = True
|
||||||
|
else:
|
||||||
|
msg.add_alternative(html_body, subtype="html", cte='quoted-printable')
|
||||||
|
|
||||||
|
# If that fails, just attach the RTF file to the message.
|
||||||
|
except:
|
||||||
|
doc.rtf_attachments += 1
|
||||||
|
fn = "messagebody_{}.rtf".format(doc.rtf_attachments)
|
||||||
|
|
||||||
|
if not has_body:
|
||||||
|
msg.set_content(
|
||||||
|
"<no plain text message body --- see attachment {}>".format(fn),
|
||||||
|
cte='quoted-printable')
|
||||||
|
has_body = True
|
||||||
|
|
||||||
# Add RTF file as an attachment.
|
# Add RTF file as an attachment.
|
||||||
msg.add_attachment(
|
msg.add_attachment(
|
||||||
rtf,
|
rtf,
|
||||||
maintype="text", subtype="rtf",
|
maintype="text", subtype="rtf",
|
||||||
filename=fn)
|
filename=fn)
|
||||||
|
|
||||||
|
if not has_body:
|
||||||
|
msg.set_content("<no message body>", cte='quoted-printable')
|
||||||
|
|
||||||
# # Copy over string values of remaining properties as headers
|
# # Copy over string values of remaining properties as headers
|
||||||
# # so we don't lose any information.
|
# # so we don't lose any information.
|
||||||
# for k, v in props.items():
|
# for k, v in props.items():
|
||||||
|
|||||||
@@ -1,2 +1,3 @@
|
|||||||
compoundfiles
|
compoundfiles
|
||||||
compressed-rtf
|
compressed-rtf
|
||||||
|
rtfparse # Python 3.9+ only
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ import setuptools
|
|||||||
install_requires = [
|
install_requires = [
|
||||||
'compoundfiles',
|
'compoundfiles',
|
||||||
'compressed_rtf',
|
'compressed_rtf',
|
||||||
|
'rtfparse',
|
||||||
]
|
]
|
||||||
|
|
||||||
with open("README.md", "r") as fh:
|
with open("README.md", "r") as fh:
|
||||||
@@ -10,7 +11,7 @@ with open("README.md", "r") as fh:
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='convert-outlook-msg-file',
|
name='convert-outlook-msg-file',
|
||||||
version='0.1.0',
|
version='0.2.0',
|
||||||
description='Parse Microsoft Outlook MSG files',
|
description='Parse Microsoft Outlook MSG files',
|
||||||
author='Joshua Tauberer',
|
author='Joshua Tauberer',
|
||||||
author_email='jt@occams.info',
|
author_email='jt@occams.info',
|
||||||
@@ -19,7 +20,7 @@ setuptools.setup(
|
|||||||
install_requires=install_requires,
|
install_requires=install_requires,
|
||||||
long_description=long_description,
|
long_description=long_description,
|
||||||
long_description_content_type="text/markdown",
|
long_description_content_type="text/markdown",
|
||||||
python_requires='>=3.6',
|
python_requires='>=3.9',
|
||||||
classifiers=[
|
classifiers=[
|
||||||
"Programming Language :: Python :: 3",
|
"Programming Language :: Python :: 3",
|
||||||
"License :: OSI Approved :: MIT License",
|
"License :: OSI Approved :: MIT License",
|
||||||
|
|||||||
Reference in New Issue
Block a user