From 4104dc937d1e30881019e5f9f07f2677c14deb65 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Fri, 23 Feb 2024 09:07:37 -0500 Subject: [PATCH] Use rtfparse to extract HTML message bodies from RTF containers and create multipart/alternative messages if both plain text and HTML are available Also fixes #20. --- README.md | 10 +++++---- outlookmsgfile.py | 57 ++++++++++++++++++++++++++++++++++------------- requirements.txt | 3 ++- setup.py | 5 +++-- 4 files changed, 53 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 17e90d3..9daff01 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,24 @@ Convert Outlook .msg Files to .eml (MIME format) ================================================ -This repository contains a Python 3.6 module for +This repository contains a Python 3.9+ module for reading Microsoft Outlook .msg files and converting them to .eml format, which is the standard MIME format for email messages. Install the dependencies with: - pip3.6 install -r requirements.txt + pip install -r requirements.txt + +(You may need to create and activate a Python virtual environment first.) Then either convert a single file by piping: - python3.6 outlookmsgfile.py < message.msg > message.eml + python outlookmsgfile.py < message.msg > message.eml Or convert a set of files: - python3.6 outlookmsgfile.py *.msg + python outlookmsgfile.py *.msg When passing filenames as command-line arguments, a new file with `.eml` appended to the filename is written out with the message in MIME format. 
diff --git a/outlookmsgfile.py b/outlookmsgfile.py index fbbe1bc..e98ce77 100644 --- a/outlookmsgfile.py +++ b/outlookmsgfile.py @@ -16,6 +16,7 @@ import re import logging import os import sys +import io from functools import reduce @@ -23,6 +24,8 @@ import email.message, email.parser, email.policy from email.utils import parsedate_to_datetime, formatdate, formataddr import compoundfiles +from rtfparse.parser import Rtf_Parser +from rtfparse.renderers.de_encapsulate_html import De_encapsulate_HTML logger = logging.getLogger(__name__) @@ -102,33 +105,57 @@ def load_message_stream(entry, is_top_level, doc): msg['Subject'] = props['SUBJECT'] del props['SUBJECT'] - # Add the plain-text body from the BODY field. + # Add a plain text body from the BODY field. + has_body = False if 'BODY' in props: body = props['BODY'] if isinstance(body, str): msg.set_content(body, cte='quoted-printable') else: msg.set_content(body, maintype="text", subtype="plain", cte='8bit') + has_body = True - # Plain-text is not availabe. Use the rich text version. - else: - doc.rtf_attachments += 1 - fn = "messagebody_{}.rtf".format(doc.rtf_attachments) - - msg.set_content( - "".format(fn), - cte='quoted-printable') - + # Add a HTML body from the RTF_COMPRESSED field. + if 'RTF_COMPRESSED' in props: # Decompress the value to Rich Text Format. import compressed_rtf rtf = props['RTF_COMPRESSED'] rtf = compressed_rtf.decompress(rtf) - # Add RTF file as an attachment. - msg.add_attachment( - rtf, - maintype="text", subtype="rtf", - filename=fn) + # Try rtfparse to de-encapsulate HTML stored in a rich + # text container. 
+ try: + rtf_blob = io.BytesIO(rtf) + parsed = Rtf_Parser(rtf_file=rtf_blob).parse_file() + html_stream = io.StringIO() + De_encapsulate_HTML().render(parsed, html_stream) + html_body = html_stream.getvalue() + + if not has_body: + msg.set_content(html_body, subtype="html", cte='quoted-printable') + has_body = True + else: + msg.add_alternative(html_body, subtype="html", cte='quoted-printable') + + # If that fails, just attach the RTF file to the message. + except: + doc.rtf_attachments += 1 + fn = "messagebody_{}.rtf".format(doc.rtf_attachments) + + if not has_body: + msg.set_content( + "".format(fn), + cte='quoted-printable') + has_body = True + + # Add RTF file as an attachment. + msg.add_attachment( + rtf, + maintype="text", subtype="rtf", + filename=fn) + + if not has_body: + msg.set_content("", cte='quoted-printable') # # Copy over string values of remaining properties as headers # # so we don't lose any information. diff --git a/requirements.txt b/requirements.txt index 8e0e5b1..a79e1e1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ compoundfiles -compressed-rtf \ No newline at end of file +compressed-rtf +rtfparse # Python 3.9+ only diff --git a/setup.py b/setup.py index 1b9668f..9ce64b5 100644 --- a/setup.py +++ b/setup.py @@ -3,6 +3,7 @@ import setuptools install_requires = [ 'compoundfiles', 'compressed_rtf', + 'rtfparse', ] with open("README.md", "r") as fh: @@ -10,7 +11,7 @@ with open("README.md", "r") as fh: setuptools.setup( name='convert-outlook-msg-file', - version='0.1.0', + version='0.2.0', description='Parse Microsoft Outlook MSG files', author='Joshua Tauberer', author_email='jt@occams.info', @@ -19,7 +20,7 @@ setuptools.setup( install_requires=install_requires, long_description=long_description, long_description_content_type="text/markdown", - python_requires='>=3.6', + python_requires='>=3.9', classifiers=[ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License",