Use rtfparse to extract HTML message bodies from RTF containers and create multipart/alternative messages if both plain text and HTML are available
Also fixes #20.
This commit is contained in:
@@ -1,22 +1,24 @@
|
||||
Convert Outlook .msg Files to .eml (MIME format)
|
||||
================================================
|
||||
|
||||
This repository contains a Python 3.6 module for
|
||||
This repository contains a Python 3.9+ module for
|
||||
reading Microsoft Outlook .msg files and converting
|
||||
them to .eml format, which is the standard MIME
|
||||
format for email messages.
|
||||
|
||||
Install the dependencies with:
|
||||
|
||||
pip3.6 install -r requirements.txt
|
||||
pip install -r requirements.txt
|
||||
|
||||
(You may need to create and activate a Python virtual environment first.)
|
||||
|
||||
Then either convert a single file by piping:
|
||||
|
||||
python3.6 outlookmsgfile.py < message.msg > message.eml
|
||||
python outlookmsgfile.py < message.msg > message.eml
|
||||
|
||||
Or convert a set of files:
|
||||
|
||||
python3.6 outlookmsgfile.py *.msg
|
||||
python outlookmsgfile.py *.msg
|
||||
|
||||
When passing filenames as command-line arguments, a new file with `.eml`
|
||||
appended to the filename is written out with the message in MIME format.
|
||||
|
||||
+37
-10
@@ -16,6 +16,7 @@ import re
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import io
|
||||
|
||||
from functools import reduce
|
||||
|
||||
@@ -23,6 +24,8 @@ import email.message, email.parser, email.policy
|
||||
from email.utils import parsedate_to_datetime, formatdate, formataddr
|
||||
|
||||
import compoundfiles
|
||||
from rtfparse.parser import Rtf_Parser
|
||||
from rtfparse.renderers.de_encapsulate_html import De_encapsulate_HTML
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -102,34 +105,58 @@ def load_message_stream(entry, is_top_level, doc):
|
||||
msg['Subject'] = props['SUBJECT']
|
||||
del props['SUBJECT']
|
||||
|
||||
# Add the plain-text body from the BODY field.
|
||||
# Add a plain text body from the BODY field.
|
||||
has_body = False
|
||||
if 'BODY' in props:
|
||||
body = props['BODY']
|
||||
if isinstance(body, str):
|
||||
msg.set_content(body, cte='quoted-printable')
|
||||
else:
|
||||
msg.set_content(body, maintype="text", subtype="plain", cte='8bit')
|
||||
has_body = True
|
||||
|
||||
# Plain text is not available. Use the rich text version.
|
||||
else:
|
||||
doc.rtf_attachments += 1
|
||||
fn = "messagebody_{}.rtf".format(doc.rtf_attachments)
|
||||
|
||||
msg.set_content(
|
||||
"<no plain text message body --- see attachment {}>".format(fn),
|
||||
cte='quoted-printable')
|
||||
|
||||
# Add an HTML body from the RTF_COMPRESSED field.
|
||||
if 'RTF_COMPRESSED' in props:
|
||||
# Decompress the value to Rich Text Format.
|
||||
import compressed_rtf
|
||||
rtf = props['RTF_COMPRESSED']
|
||||
rtf = compressed_rtf.decompress(rtf)
|
||||
|
||||
# Try rtfparse to de-encapsulate HTML stored in a rich
|
||||
# text container.
|
||||
try:
|
||||
rtf_blob = io.BytesIO(rtf)
|
||||
parsed = Rtf_Parser(rtf_file=rtf_blob).parse_file()
|
||||
html_stream = io.StringIO()
|
||||
De_encapsulate_HTML().render(parsed, html_stream)
|
||||
html_body = html_stream.getvalue()
|
||||
|
||||
if not has_body:
|
||||
msg.set_content(html_body, subtype="html", cte='quoted-printable')
|
||||
has_body = True
|
||||
else:
|
||||
msg.add_alternative(html_body, subtype="html", cte='quoted-printable')
|
||||
|
||||
# If that fails, just attach the RTF file to the message.
|
||||
except:
|
||||
doc.rtf_attachments += 1
|
||||
fn = "messagebody_{}.rtf".format(doc.rtf_attachments)
|
||||
|
||||
if not has_body:
|
||||
msg.set_content(
|
||||
"<no plain text message body --- see attachment {}>".format(fn),
|
||||
cte='quoted-printable')
|
||||
has_body = True
|
||||
|
||||
# Add RTF file as an attachment.
|
||||
msg.add_attachment(
|
||||
rtf,
|
||||
maintype="text", subtype="rtf",
|
||||
filename=fn)
|
||||
|
||||
if not has_body:
|
||||
msg.set_content("<no message body>", cte='quoted-printable')
|
||||
|
||||
# # Copy over string values of remaining properties as headers
|
||||
# # so we don't lose any information.
|
||||
# for k, v in props.items():
|
||||
|
||||
@@ -1,2 +1,3 @@
|
||||
compoundfiles
|
||||
compressed-rtf
|
||||
rtfparse # Python 3.9+ only
|
||||
|
||||
@@ -3,6 +3,7 @@ import setuptools
|
||||
install_requires = [
|
||||
'compoundfiles',
|
||||
'compressed_rtf',
|
||||
'rtfparse',
|
||||
]
|
||||
|
||||
with open("README.md", "r") as fh:
|
||||
@@ -10,7 +11,7 @@ with open("README.md", "r") as fh:
|
||||
|
||||
setuptools.setup(
|
||||
name='convert-outlook-msg-file',
|
||||
version='0.1.0',
|
||||
version='0.2.0',
|
||||
description='Parse Microsoft Outlook MSG files',
|
||||
author='Joshua Tauberer',
|
||||
author_email='jt@occams.info',
|
||||
@@ -19,7 +20,7 @@ setuptools.setup(
|
||||
install_requires=install_requires,
|
||||
long_description=long_description,
|
||||
long_description_content_type="text/markdown",
|
||||
python_requires='>=3.6',
|
||||
python_requires='>=3.9',
|
||||
classifiers=[
|
||||
"Programming Language :: Python :: 3",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
|
||||
Reference in New Issue
Block a user