Use rtfparse to extract HTML message bodies from RTF containers and create multipart/alternative messages if both plain text and HTML are available

Also fixes #20.
This commit is contained in:
Joshua Tauberer
2024-02-23 09:07:37 -05:00
parent 6fc382e9a6
commit 4104dc937d
4 changed files with 53 additions and 22 deletions
+6 -4
View File
@@ -1,22 +1,24 @@
Convert Outlook .msg Files to .eml (MIME format) Convert Outlook .msg Files to .eml (MIME format)
================================================ ================================================
This repository contains a Python 3.6 module for This repository contains a Python 3.9+ module for
reading Microsoft Outlook .msg files and converting reading Microsoft Outlook .msg files and converting
them to .eml format, which is the standard MIME them to .eml format, which is the standard MIME
format for email messages. format for email messages.
Install the dependencies with: Install the dependencies with:
pip3.6 install -r requirements.txt pip install -r requirements.txt
(You may need to create and activate a Python virtual environment first.)
Then either convert a single file by piping: Then either convert a single file by piping:
python3.6 outlookmsgfile.py < message.msg > message.eml python outlookmsgfile.py < message.msg > message.eml
Or convert a set of files: Or convert a set of files:
python3.6 outlookmsgfile.py *.msg python outlookmsgfile.py *.msg
When passing filenames as command-line arguments, a new file with `.eml` When passing filenames as command-line arguments, a new file with `.eml`
appended to the filename is written out with the message in MIME format. appended to the filename is written out with the message in MIME format.
+42 -15
View File
@@ -16,6 +16,7 @@ import re
import logging import logging
import os import os
import sys import sys
import io
from functools import reduce from functools import reduce
@@ -23,6 +24,8 @@ import email.message, email.parser, email.policy
from email.utils import parsedate_to_datetime, formatdate, formataddr from email.utils import parsedate_to_datetime, formatdate, formataddr
import compoundfiles import compoundfiles
from rtfparse.parser import Rtf_Parser
from rtfparse.renderers.de_encapsulate_html import De_encapsulate_HTML
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -102,33 +105,57 @@ def load_message_stream(entry, is_top_level, doc):
msg['Subject'] = props['SUBJECT'] msg['Subject'] = props['SUBJECT']
del props['SUBJECT'] del props['SUBJECT']
# Add the plain-text body from the BODY field. # Add a plain text body from the BODY field.
has_body = False
if 'BODY' in props: if 'BODY' in props:
body = props['BODY'] body = props['BODY']
if isinstance(body, str): if isinstance(body, str):
msg.set_content(body, cte='quoted-printable') msg.set_content(body, cte='quoted-printable')
else: else:
msg.set_content(body, maintype="text", subtype="plain", cte='8bit') msg.set_content(body, maintype="text", subtype="plain", cte='8bit')
has_body = True
# Plain-text is not available. Use the rich text version. # Add an HTML body from the RTF_COMPRESSED field.
else: if 'RTF_COMPRESSED' in props:
doc.rtf_attachments += 1
fn = "messagebody_{}.rtf".format(doc.rtf_attachments)
msg.set_content(
"<no plain text message body --- see attachment {}>".format(fn),
cte='quoted-printable')
# Decompress the value to Rich Text Format. # Decompress the value to Rich Text Format.
import compressed_rtf import compressed_rtf
rtf = props['RTF_COMPRESSED'] rtf = props['RTF_COMPRESSED']
rtf = compressed_rtf.decompress(rtf) rtf = compressed_rtf.decompress(rtf)
# Add RTF file as an attachment. # Try rtfparse to de-encapsulate HTML stored in a rich
msg.add_attachment( # text container.
rtf, try:
maintype="text", subtype="rtf", rtf_blob = io.BytesIO(rtf)
filename=fn) parsed = Rtf_Parser(rtf_file=rtf_blob).parse_file()
html_stream = io.StringIO()
De_encapsulate_HTML().render(parsed, html_stream)
html_body = html_stream.getvalue()
if not has_body:
msg.set_content(html_body, subtype="html", cte='quoted-printable')
has_body = True
else:
msg.add_alternative(html_body, subtype="html", cte='quoted-printable')
# If that fails, just attach the RTF file to the message.
except:
doc.rtf_attachments += 1
fn = "messagebody_{}.rtf".format(doc.rtf_attachments)
if not has_body:
msg.set_content(
"<no plain text message body --- see attachment {}>".format(fn),
cte='quoted-printable')
has_body = True
# Add RTF file as an attachment.
msg.add_attachment(
rtf,
maintype="text", subtype="rtf",
filename=fn)
if not has_body:
msg.set_content("<no message body>", cte='quoted-printable')
# # Copy over string values of remaining properties as headers # # Copy over string values of remaining properties as headers
# # so we don't lose any information. # # so we don't lose any information.
+1
View File
@@ -1,2 +1,3 @@
compoundfiles compoundfiles
compressed-rtf compressed-rtf
rtfparse # Python 3.9+ only
+3 -2
View File
@@ -3,6 +3,7 @@ import setuptools
install_requires = [ install_requires = [
'compoundfiles', 'compoundfiles',
'compressed_rtf', 'compressed_rtf',
'rtfparse',
] ]
with open("README.md", "r") as fh: with open("README.md", "r") as fh:
@@ -10,7 +11,7 @@ with open("README.md", "r") as fh:
setuptools.setup( setuptools.setup(
name='convert-outlook-msg-file', name='convert-outlook-msg-file',
version='0.1.0', version='0.2.0',
description='Parse Microsoft Outlook MSG files', description='Parse Microsoft Outlook MSG files',
author='Joshua Tauberer', author='Joshua Tauberer',
author_email='jt@occams.info', author_email='jt@occams.info',
@@ -19,7 +20,7 @@ setuptools.setup(
install_requires=install_requires, install_requires=install_requires,
long_description=long_description, long_description=long_description,
long_description_content_type="text/markdown", long_description_content_type="text/markdown",
python_requires='>=3.6', python_requires='>=3.9',
classifiers=[ classifiers=[
"Programming Language :: Python :: 3", "Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License", "License :: OSI Approved :: MIT License",