Create pyqt_pdf_print.py
This is simple code for accessing the content of .msg email files; it exports the message to PDF and HTML and gives access to all attachments.
This commit is contained in:
205
pyqt_pdf_print.py
Normal file
205
pyqt_pdf_print.py
Normal file
@@ -0,0 +1,205 @@
|
||||
from PyQt6.QtPrintSupport import QPrinter
|
||||
from PyQt6.QtGui import QTextDocument, QImage
|
||||
from PyQt6.QtCore import QMarginsF
|
||||
from PyQt6.QtWidgets import QApplication,QMainWindow
|
||||
import extract_msg
|
||||
import re
|
||||
import sys
|
||||
import os
|
||||
import base64
|
||||
|
||||
# Human-readable name of the application.
application_name = "MSG Viewer"

# Translation table for str.translate() that escapes the characters HTML
# treats as markup, so a value such as "Name <user@host>" is displayed
# literally instead of being parsed as a tag. '&' is included so that text
# which already looks like an entity is not reinterpreted.
replacement = str.maketrans({'&': '&amp;', '<': '&lt;', '>': '&gt;'})
|
||||
|
||||
# --- Functions ---
|
||||
def check_folder_path(folder_location):
    """
    Make sure the directory part of ``folder_location`` exists.

    A path whose last component has a file extension is treated as a file
    path: only its parent directory is created, never the file itself.
    A path without an extension is created as a directory.

    Returns a status message string on success. On failure the error is
    printed and ``None`` is returned (the function never raises for
    filesystem errors).
    """
    try:
        if os.path.exists(folder_location):
            return f"Path {folder_location} already exists."

        folder_path = folder_location
        if os.path.splitext(folder_location)[1]:
            # A bare file name ("out.pdf") has an empty dirname, which means
            # "current directory"; os.makedirs("") would raise, so map it to
            # os.curdir instead.
            folder_path = os.path.dirname(folder_location) or os.curdir

        os.makedirs(folder_path, exist_ok=True)
        return f"Path {folder_path} now exists."
    except (OSError, TypeError, ValueError) as err:
        print(f'Error checking file path: {folder_location} - {err}')
        return None
|
||||
|
||||
def sanitize_name(name):
    """
    Turn an arbitrary string into a file-name-friendly token.

    Surrounding whitespace is removed, inner spaces become underscores, and
    every character that is not a word character, dot, comma or hyphen is
    dropped; leading/trailing underscores are trimmed afterwards.

    Returns "Untitled" for a falsy input and "Unnamed" when nothing is left
    after cleaning or when cleaning fails.
    """
    try:
        if name:
            underscored = name.strip().replace(" ", "_")
            cleaned = re.sub(r'[^\w.,-]', '', underscored).strip('_')
            return cleaned or "Unnamed"
        return "Untitled"
    except Exception as e:
        print(f"Error while cleaning subject name: {e}")
        return "Unnamed"
|
||||
|
||||
def clean_html_colors(html_content):
    """
    Rewrite colour notations in ``html_content`` as ``#RRGGBB`` hex codes.

    * ``rgb(r, g, b)`` (any letter case, optional whitespace around the
      numbers and commas) becomes an upper-case ``#RRGGBB`` value. Channel
      values above 255 are clamped to 255 so the result is always exactly
      six hex digits.
    * The literal token ``#rgba`` is replaced with black (``#000000``).

    Returns the rewritten string.
    """
    def rgb_to_hex(match):
        # \d+ cannot be negative, so only the upper bound needs clamping.
        r, g, b = (min(int(channel), 255) for channel in match.groups())
        return f'#{r:02X}{g:02X}{b:02X}'

    html_content = re.sub(
        r'rgb\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)',
        rgb_to_hex,
        html_content,
        flags=re.IGNORECASE,
    )

    # Replace the remaining invalid "#rgba" placeholder with black.
    html_content = re.sub(r'#rgba\b', '#000000', html_content)

    return html_content
|
||||
|
||||
|
||||
def extract_images(msg):
    """
    Collect the image attachments of a message as ``data:`` URIs.

    An attachment counts as an image when its file name (long name, falling
    back to the short name) ends in .png, .jpg, .jpeg or .gif. Attachments
    without a file name or whose data is not raw bytes are skipped.

    Returns a dict mapping the attachment's Content-ID (or its file name when
    it has no Content-ID) to a ``data:<mime>;base64,<payload>`` string.
    """
    mime_by_extension = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".gif": "image/gif",
    }

    image_dict = {}
    for attachment in msg.attachments:
        # longFilename can be missing; use the short name like the rest of
        # the file does, and treat "no name at all" as "not an image".
        filename = attachment.longFilename or attachment.shortFilename or ""
        extension = os.path.splitext(filename)[1].lower()
        mime_type = mime_by_extension.get(extension)
        if mime_type is None:
            continue

        image_data = attachment.data
        if not isinstance(image_data, (bytes, bytearray)):
            # Nothing that can be base64-encoded (e.g. no payload).
            continue

        cid = attachment.cid or filename
        encoded = base64.b64encode(image_data).decode('ascii')
        image_dict[cid] = f"data:{mime_type};base64,{encoded}"

    return image_dict
|
||||
|
||||
def replace_img_src(html, image_dict):
    """
    Replace ``cid:<content-id>`` references in ``html`` with inline data.

    ``image_dict`` maps a Content-ID to the text that should replace the
    whole ``cid:<content-id>`` reference (a ``data:`` URI). References whose
    Content-ID is not in ``image_dict`` are left untouched.

    All substitutions happen in a single pass, so the result does not depend
    on the iteration order of ``image_dict`` and a short Content-ID can never
    clobber the prefix of a longer one (``image1`` vs ``image10``).

    Returns the rewritten HTML; ``html`` is returned unchanged when it is
    empty or when there is nothing to replace.
    """
    if not html or not image_dict:
        return html

    # Longest IDs first: regex alternation picks the first alternative that
    # matches, so this makes "image10" win over "image1" at the same position.
    alternatives = "|".join(
        re.escape(cid) for cid in sorted(image_dict, key=len, reverse=True)
    )
    pattern = re.compile(rf'cid:({alternatives})')

    # A function replacement inserts the value verbatim (no backslash or
    # group-reference processing of the data URI).
    return pattern.sub(lambda match: image_dict[match.group(1)], html)
|
||||
|
||||
def export_to_pdf(content, pdf_path):
    """
    Render ``content`` into a PDF file at ``pdf_path``.

    The text is loaded as HTML when it contains both "<" and ">", otherwise
    as plain text, and printed through a QPrinter configured for PDF output
    with margins of 10 on every side.

    Returns True on success; on any exception the error is printed and
    False is returned.
    """
    try:
        document = QTextDocument()
        looks_like_html = "<" in content and ">" in content
        if not looks_like_html:
            document.setPlainText(content)
        else:
            document.setHtml(content)

        pdf_printer = QPrinter()
        pdf_printer.setOutputFormat(QPrinter.OutputFormat.PdfFormat)
        pdf_printer.setOutputFileName(pdf_path)
        pdf_printer.setPageMargins(QMarginsF(10, 10, 10, 10))

        document.print(pdf_printer)
    except Exception as e:
        print(f"Error in export_to_pdf: {e}")
        return False
    return True
|
||||
|
||||
# Sample input message and output location; MainWindow passes both to
# msg_to_pdf().
source_file = "emails/The latest Infinigate Cloud news and highlights.msg"
pdf_path = "tests/out_pdf.pdf"
|
||||
|
||||
|
||||
## Accessing emails:
|
||||
def get_msg_content(msg_file):
    """
    Read a .msg file and return its printable content and its attachments.

    Returns a 3-tuple:
        1. The message as one string: a From/To/Subject/CC/BCC/Date header
           block, an optional attachment listing, then the body. The output
           is HTML (lines separated by "<br>") when the message has an HTML
           body, otherwise plain text.
        2. A list of attachment names. Attachments that carry a Content-ID
           are treated as embedded images and are not listed.
        3. A dict mapping each attachment name to a tuple
           ``(attachment, attachment.data)``.

    Example:
        content = get_msg_content(msg_file)
        attachment, data = content[2][content[1][0]]
        attachment, data = content[2]['attachment_name.pdf']

    Attachment names are passed through sanitize_name(); when two
    attachments end up with the same name, later ones get a numeric suffix
    so that none is lost.
    """
    # Function-scope import so this block does not depend on the file header.
    from html import escape

    # NOTE(review): the message is left open because the returned attachment
    # objects may still read from it - confirm before adding a close() here.
    msg = extract_msg.Message(msg_file)
    html_body = msg.htmlBody
    is_html = bool(html_body)
    line_break = "<br>" if is_html else "\n"

    def as_header(value):
        """Render a header value; escape markup only for HTML output."""
        text = "" if value is None else str(value)
        # Without escaping, "Name <user@host>" would be parsed as a tag and
        # disappear from the rendered HTML.
        return escape(text, quote=False) if is_html else text

    def addresses_of(kind):
        """Comma-join the e-mail addresses of recipients of the given type."""
        # NOTE(review): assumes recipient.type compares equal to "cc"/"bcc";
        # verify against the installed extract_msg version.
        return ", ".join(
            recipient.email
            for recipient in msg.recipients
            if recipient.type == kind and recipient.email
        )

    # Body: inline the embedded images for HTML, otherwise use the text body.
    if is_html:
        content = replace_img_src(
            html_body.decode('utf-8', errors='ignore'), extract_images(msg)
        )
    else:
        content = msg.body or ""

    # Header block.
    sender = as_header(msg.sender or "Unknown Sender")
    to = as_header(
        ", ".join(recipient.formatted for recipient in msg.recipients)
        or "Unknown Recipient"
    )
    metadata = (
        f"From: {sender}{line_break}"
        f"To: {to}{line_break}"
        f"Subject: {as_header(msg.subject)}{line_break}"
    )

    cc = addresses_of("cc")
    if cc:
        metadata += f"CC: {as_header(cc)}{line_break}"
    bcc = addresses_of("bcc")
    if bcc:
        metadata += f"BCC: {as_header(bcc)}{line_break}"

    sent = msg.date
    if sent is None:
        date_str = "Unknown Date"
    elif hasattr(sent, "strftime"):
        date_str = sent.strftime("%d. %b %Y %H:%M")
    else:
        # presumably a library version that returns the date as text
        date_str = str(sent)
    metadata += f"Date: {date_str}{line_break}{'_' * 65}{line_break}"

    # Attachments: everything without a Content-ID is a "real" attachment.
    attachment_info = {}
    for attachment in msg.attachments:
        if attachment.contentId:
            continue
        attach_name = (
            attachment.longFilename
            or attachment.shortFilename
            or f"Unnamed_Attachment_{len(attachment_info)}"
        )
        sanitized_attach_name = sanitize_name(attach_name)

        # Keep same-named attachments apart instead of overwriting.
        unique_name = sanitized_attach_name
        root, ext = os.path.splitext(sanitized_attach_name)
        suffix = 1
        while unique_name in attachment_info:
            suffix += 1
            unique_name = f"{root}_{suffix}{ext}"

        attachment_info[unique_name] = (attachment, attachment.data)

    body = clean_html_colors(str(content))

    if not attachment_info:
        return metadata + line_break + body, [], {}

    attachments_sumary = list(attachment_info)
    attachments_list = (
        f"In this email were {len(attachment_info)} attachments:{line_break}"
        + "".join(
            f"- {as_header(name)}{line_break}" for name in attachments_sumary
        )
        + f"{'_' * 65}{line_break}"
    )
    content = metadata + attachments_list + line_break + body
    return content, attachments_sumary, attachment_info
|
||||
|
||||
|
||||
def msg_to_pdf(msg_file, pdf_path=None):
    """
    Export a .msg file to a PDF and to an HTML file next to it.

    Args:
        msg_file: path of the .msg file to convert.
        pdf_path: destination of the PDF. When omitted, the file is written
            to ``exported-msg/msg-<sanitized subject>.pdf``.

    The HTML file uses the same path with the extension replaced by
    ``.html``. It is written because the PDF produced through PyQt lacks
    links and web images. A status line is printed; nothing is returned.
    """
    content = get_msg_content(msg_file)

    if pdf_path is None:
        # The message is opened only to read its subject; close it again so
        # the file handle is not leaked.
        msg = extract_msg.Message(msg_file)
        try:
            subject = msg.subject
        finally:
            msg.close()
        pdf_path = f'exported-msg/msg-{sanitize_name(subject)}.pdf'

    check_folder_path(pdf_path)

    # Swap only the final extension: str.replace('.pdf', ...) would also
    # rewrite directory names and would leave a path without ".pdf" unchanged,
    # letting the PDF overwrite the HTML file.
    html_path = os.path.splitext(pdf_path)[0] + '.html'
    if html_path == pdf_path:
        html_path = pdf_path + '.html'

    # Explicit encoding: the platform default may be unable to represent
    # characters found in the e-mail.
    with open(html_path, "w", encoding="utf-8") as f:
        f.write(content[0])

    if export_to_pdf(content[0], pdf_path):
        print("Exported")
    else:
        print(f"Export to PDF failed: {pdf_path}")
|
||||
|
||||
# Subclass QMainWindow to customize your application's main window
|
||||
class MainWindow(QMainWindow):
    """Main window that runs the export as soon as it is constructed."""

    def __init__(self):
        super().__init__()
        # One-shot run: convert the configured sample message, then end the
        # process right away (the window itself is never shown).
        msg_to_pdf(msg_file=source_file, pdf_path=pdf_path)
        sys.exit()
|
||||
|
||||
if __name__ == "__main__":
    # Create the Qt application first, then the window; constructing the
    # window performs the export and exits the process.
    app = QApplication(sys.argv)
    main_window = MainWindow()
    exit_code = app.exec()
    sys.exit(exit_code)
|
||||
Reference in New Issue
Block a user