Use rtfparse to extract HTML message bodies from RTF containers and create multipart/alternative messages if both plain text and HTML are available

Also fixes #20.
This commit is contained in:
Joshua Tauberer
2024-02-23 09:07:37 -05:00
parent 6fc382e9a6
commit 4104dc937d
4 changed files with 53 additions and 22 deletions
+6 -4
View File
@@ -1,22 +1,24 @@
Convert Outlook .msg Files to .eml (MIME format)
================================================
This repository contains a Python 3.6 module for
This repository contains a Python 3.9+ module for
reading Microsoft Outlook .msg files and converting
them to .eml format, which is the standard MIME
format for email messages.
Install the dependencies with:
pip3.6 install -r requirements.txt
pip install -r requirements.txt
(You may need to create and activate a Python virtual environment first.)
Then either convert a single file by piping:
python3.6 outlookmsgfile.py < message.msg > message.eml
python outlookmsgfile.py < message.msg > message.eml
Or convert a set of files:
python3.6 outlookmsgfile.py *.msg
python outlookmsgfile.py *.msg
When passing filenames as command-line arguments, a new file with `.eml`
appended to the filename is written out with the message in MIME format.
+42 -15
View File
@@ -16,6 +16,7 @@ import re
import logging
import os
import sys
import io
from functools import reduce
@@ -23,6 +24,8 @@ import email.message, email.parser, email.policy
from email.utils import parsedate_to_datetime, formatdate, formataddr
import compoundfiles
from rtfparse.parser import Rtf_Parser
from rtfparse.renderers.de_encapsulate_html import De_encapsulate_HTML
logger = logging.getLogger(__name__)
@@ -102,33 +105,57 @@ def load_message_stream(entry, is_top_level, doc):
msg['Subject'] = props['SUBJECT']
del props['SUBJECT']
# Add the plain-text body from the BODY field.
# Add a plain text body from the BODY field.
has_body = False
if 'BODY' in props:
body = props['BODY']
if isinstance(body, str):
msg.set_content(body, cte='quoted-printable')
else:
msg.set_content(body, maintype="text", subtype="plain", cte='8bit')
has_body = True
# Plain-text is not available. Use the rich text version.
else:
doc.rtf_attachments += 1
fn = "messagebody_{}.rtf".format(doc.rtf_attachments)
msg.set_content(
"<no plain text message body --- see attachment {}>".format(fn),
cte='quoted-printable')
# Add an HTML body from the RTF_COMPRESSED field.
if 'RTF_COMPRESSED' in props:
# Decompress the value to Rich Text Format.
import compressed_rtf
rtf = props['RTF_COMPRESSED']
rtf = compressed_rtf.decompress(rtf)
# Add RTF file as an attachment.
msg.add_attachment(
rtf,
maintype="text", subtype="rtf",
filename=fn)
# Try rtfparse to de-encapsulate HTML stored in a rich
# text container.
try:
rtf_blob = io.BytesIO(rtf)
parsed = Rtf_Parser(rtf_file=rtf_blob).parse_file()
html_stream = io.StringIO()
De_encapsulate_HTML().render(parsed, html_stream)
html_body = html_stream.getvalue()
if not has_body:
msg.set_content(html_body, subtype="html", cte='quoted-printable')
has_body = True
else:
msg.add_alternative(html_body, subtype="html", cte='quoted-printable')
# If that fails, just attach the RTF file to the message.
except:
doc.rtf_attachments += 1
fn = "messagebody_{}.rtf".format(doc.rtf_attachments)
if not has_body:
msg.set_content(
"<no plain text message body --- see attachment {}>".format(fn),
cte='quoted-printable')
has_body = True
# Add RTF file as an attachment.
msg.add_attachment(
rtf,
maintype="text", subtype="rtf",
filename=fn)
if not has_body:
msg.set_content("<no message body>", cte='quoted-printable')
# # Copy over string values of remaining properties as headers
# # so we don't lose any information.
+1
View File
@@ -1,2 +1,3 @@
compoundfiles
compressed-rtf
rtfparse # Python 3.9+ only
+3 -2
View File
@@ -3,6 +3,7 @@ import setuptools
install_requires = [
'compoundfiles',
'compressed_rtf',
'rtfparse',
]
with open("README.md", "r") as fh:
@@ -10,7 +11,7 @@ with open("README.md", "r") as fh:
setuptools.setup(
name='convert-outlook-msg-file',
version='0.1.0',
version='0.2.0',
description='Parse Microsoft Outlook MSG files',
author='Joshua Tauberer',
author_email='jt@occams.info',
@@ -19,7 +20,7 @@ setuptools.setup(
install_requires=install_requires,
long_description=long_description,
long_description_content_type="text/markdown",
python_requires='>=3.6',
python_requires='>=3.9',
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",