# Origin: git patch cbcd61bf3342883c51e43e6bbe1ee33cd4e0638f
# From: ghostersk <68815071+ghostersk@users.noreply.github.com>
# Date: Sun, 16 Mar 2025 21:02:32 +0000
# Subject: [PATCH] Create pyqt_pdf_print.py
"""Simple code for accessing .msg email content.

Allows exporting a message to PDF and HTML and gives access to all of its
attachments.
"""
from PyQt6.QtPrintSupport import QPrinter
from PyQt6.QtGui import QTextDocument, QImage
from PyQt6.QtCore import QMarginsF
from PyQt6.QtWidgets import QApplication,QMainWindow
import extract_msg
import re
import sys
import os
import base64

application_name = "MSG Viewer"
# Escape table for text that is interpolated into HTML. "&" is included so
# that ampersands already present in the text are not read as entities.
replacement = str.maketrans({'&': '&amp;', '<': '&lt;', '>': '&gt;'})

# File extensions treated as inline images, and the MIME type of each.
_IMAGE_MIME_TYPES = {
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".gif": "image/gif",
}
# Matches something shaped like an HTML tag (<br>, </p>, <a href="...">,
# <!DOCTYPE html>) but not an e-mail address in angle brackets.
_HTML_TAG_RE = re.compile(r'<\s*/?\s*!?[A-Za-z][A-Za-z0-9]*(?:\s[^<>]*)?/?>')
# Numeric recipient types mapped to names.
# NOTE(review): values assumed from extract_msg's recipient enum - confirm.
_RECIPIENT_KINDS = {1: "to", 2: "cc", 3: "bcc"}


# --- Functions ---
def _looks_like_html(text):
    """Return True when *text* contains at least one HTML-like tag."""
    return bool(_HTML_TAG_RE.search(text))


def _recipient_kind(recipient):
    """Return the recipient type as a lowercase string ("to", "cc", "bcc").

    The ``type`` attribute may be an enum member, an int or a string
    depending on the extract_msg release (presumably an enum in current
    releases - verify); all three are normalised here. Returns "" when the
    type is missing or unknown.
    """
    kind = getattr(recipient, "type", None)
    name = getattr(kind, "name", None)  # set when ``kind`` is an enum member
    if isinstance(name, str):
        return name.lower()
    if isinstance(kind, int):
        return _RECIPIENT_KINDS.get(kind, "")
    return str(kind or "").lower()


def _unique_name(name, taken):
    """Return *name*, or *name* with a numeric suffix if it is in *taken*."""
    if name not in taken:
        return name
    stem, ext = os.path.splitext(name)
    counter = 1
    while f"{stem}_{counter}{ext}" in taken:
        counter += 1
    return f"{stem}_{counter}{ext}"


def check_folder_path(folder_location):
    """
    Ensure only a folder is created from the given path.
    If the path includes a filename (detected by a file extension), only the
    folder part of the path is created.

    Returns a short status message, or None if the folder could not be
    created (the error is printed).
    """
    try:
        if os.path.exists(folder_location):
            return f"Path {folder_location} already exists."
        # Extract directory path (remove filename if present)
        folder_path = folder_location
        if os.path.splitext(folder_location)[1]:  # Checks if there's a file extension
            folder_path = os.path.dirname(folder_location)  # Get only the folder path
        if not folder_path:
            # A bare filename lives in the current directory, which already
            # exists; os.makedirs("") would raise.
            return f"Path {folder_location} needs no new folder."
        # Ensure the directory exists
        os.makedirs(folder_path, exist_ok=True)
        return f"Path {folder_path} now exists."
    except (OSError, TypeError, ValueError) as err:
        print(f'Error checking file path: {folder_location} - {err}')
        return None


def sanitize_name(name):
    """
    Sanitize a string to keep only letters, numbers, and basic punctuation;
    replace spaces with underscores.

    Returns "Untitled" for an empty/None name and "Unnamed" when nothing is
    left after sanitizing or the value is not a string.
    """
    try:
        if not name:
            return "Untitled"
        name = name.strip().replace(" ", "_")
        sanitized = re.sub(r'[^\w.,-]', '', name).strip('_')
        return sanitized if sanitized else "Unnamed"
    except (AttributeError, TypeError) as e:
        print(f"Error while cleaning subject name: {e}")
        return "Unnamed"


def clean_html_colors(html_content):
    """ Convert invalid color formats to standard hex codes """
    # Convert `rgb(r, g, b)` to `#RRGGBB`
    def rgb_to_hex(match):
        # Clamp to 255 so an out-of-range component cannot yield more than
        # two hex digits and break the #RRGGBB form.
        r, g, b = (min(int(value), 255) for value in match.groups())
        return f'#{r:02X}{g:02X}{b:02X}'  # Convert to uppercase hex (e.g., #FFFFFF)

    html_content = re.sub(
        r'rgb\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)', rgb_to_hex, html_content
    )

    # Replace any remaining invalid colors (like #rgba) with black
    html_content = re.sub(r'#rgba\b', '#000000', html_content)

    return html_content


def extract_images(msg):
    """
    Extract images from an MSG file and keep them in memory as Base64
    ``data:`` URIs, keyed by Content-ID (or by filename when there is none).
    """
    image_dict = {}
    for attachment in msg.attachments:
        filename = attachment.longFilename or attachment.shortFilename or ""
        mime_type = _IMAGE_MIME_TYPES.get(os.path.splitext(filename)[1].lower())
        if mime_type is None:
            continue  # not an image type we embed
        image_data = attachment.data  # Raw image data
        if not isinstance(image_data, (bytes, bytearray)):
            continue  # nothing that can be Base64-encoded
        cid = attachment.cid or filename  # Content ID or filename

        # Convert image data to Base64 (for embedding)
        base64_data = base64.b64encode(image_data).decode('ascii')
        image_dict[cid] = f"data:{mime_type};base64,{base64_data}"

    return image_dict


def replace_img_src(html, image_dict):
    """
    Replace Content-ID references with Base64 images.
    Only images referenced through ``cid:`` are replaced by this function.
    """
    # Longest CID first, so a CID that is a prefix of another one cannot
    # clobber the longer reference.
    for cid in sorted(image_dict, key=len, reverse=True):
        html = html.replace(f"cid:{cid}", image_dict[cid])
    return html


def export_to_pdf(content, pdf_path):
    """
    Render *content* into a PDF file at *pdf_path*.

    Content containing HTML-like tags is loaded as HTML, anything else as
    plain text. Returns True when printing ran without raising, otherwise
    False (the error is printed).
    """
    try:
        doc = QTextDocument()
        if _looks_like_html(content):
            doc.setHtml(content)
        else:
            doc.setPlainText(content)

        printer = QPrinter()
        printer.setOutputFormat(QPrinter.OutputFormat.PdfFormat)
        printer.setOutputFileName(pdf_path)
        printer.setPageMargins(QMarginsF(10, 10, 10, 10))
        doc.print(printer)
        return True
    except Exception as e:
        print(f"Error in export_to_pdf: {e}")
        return False


source_file = "emails/The latest Infinigate Cloud news and highlights.msg"
pdf_path = "tests/out_pdf.pdf"


## Accessing emails:
def get_msg_content(msg_file):
    """
    By passing .msg file you will get 3 values:
        1. The content of the body as a string - in HTML format if the
           message was HTML - preceded by a From/To/Subject/Date header
        2. List of attachment names, if any available (attachments that
           carry a Content-ID, i.e. embedded images, are skipped)
        3. Dictionary mapping each attachment name to a tuple of
           (attachment object, attachment data)
    To get the attachment you can use:
        content = get_msg_content(msg_file)
        content[2][content[1][0]]
    Or if you know the name:
        content[2]['attachment_name.pdf']
    """
    # NOTE: the message is intentionally left open, because the returned
    # attachment objects still belong to it.
    msg = extract_msg.Message(msg_file)
    raw_html = msg.htmlBody
    is_html = bool(raw_html)
    line_break = "<br>" if is_html else "\n"

    def escape(text):
        # Only HTML output needs entity escaping; plain text stays verbatim.
        return text.translate(replacement) if is_html else text

    if is_html:
        if isinstance(raw_html, (bytes, bytearray)):
            raw_html = raw_html.decode('utf-8', errors='ignore')
        # Extract images and replace CID references
        body = replace_img_src(raw_html, extract_images(msg))
        body = clean_html_colors(body)
    else:
        body = msg.body or ""

    # Get sender and recipients and prepare them to be shown
    recipients = list(msg.recipients)
    sender = escape(msg.sender or "Unknown Sender")
    # Everything that is not explicitly CC/BCC counts as "To", so recipients
    # of unknown type are still listed.
    to = ", ".join(
        r.formatted for r in recipients
        if r.formatted and _recipient_kind(r) not in ("cc", "bcc")
    ) or "Unknown Recipient"
    to = escape(to)

    metadata = (
        f"From: {sender}{line_break}"
        f"To: {to}{line_break}"
        f"Subject: {escape(msg.subject or '')}{line_break}"
    )

    cc = ", ".join(r.email for r in recipients if r.email and _recipient_kind(r) == "cc")
    if cc:
        metadata += f"CC: {escape(cc)}{line_break}"
    bcc = ", ".join(r.email for r in recipients if r.email and _recipient_kind(r) == "bcc")
    if bcc:
        metadata += f"BCC: {escape(bcc)}{line_break}"

    date = msg.date
    if date is None:
        date_str = "Unknown Date"
    elif hasattr(date, "strftime"):
        date_str = date.strftime("%d. %b %Y %H:%M")
    else:
        date_str = str(date)
    metadata += f"Date: {date_str}{line_break}{'_' * 65}{line_break}"

    # Attachments:
    attachment_info = {}
    for attachment in msg.attachments:
        if attachment.contentId:
            continue  # has a Content-ID: treated as an embedded image
        attach_name = (
            attachment.longFilename
            or attachment.shortFilename
            or f"Unnamed_Attachment_{len(attachment_info)}"
        )
        # Two attachments may sanitize to the same name; keep both.
        key = _unique_name(sanitize_name(attach_name), attachment_info)
        attachment_info[key] = (attachment, attachment.data)

    attachments_sumary = list(attachment_info)
    attachments_list = ""
    if attachments_sumary:
        parts = [f"In this email were {len(attachments_sumary)} attachments:{line_break}"]
        parts.extend(f"- {name}{line_break}" for name in attachments_sumary)
        parts.append(f"{'_' * 65}{line_break}")
        attachments_list = "".join(parts)

    content = metadata + attachments_list + line_break + body
    return content, attachments_sumary, attachment_info


def msg_to_pdf(msg_file, pdf_path=None):
    """
    Export a .msg file to a PDF and to an HTML file with the same base name.

    When *pdf_path* is None the output goes to
    ``exported-msg/msg-<sanitized subject>.pdf``. Returns True if the PDF
    export succeeded, otherwise False.
    """
    content = get_msg_content(msg_file)

    if pdf_path is None:
        # The subject is only needed to build the default file name.
        msg = extract_msg.Message(msg_file)
        try:
            subject = msg.subject
        finally:
            msg.close()
        pdf_path = f'exported-msg/msg-{sanitize_name(subject)}.pdf'
    # example how to get an attachment: content[2][content[1][0]]

    check_folder_path(pdf_path)
    body = content[0]
    # Swap only the real extension, so ".pdf" inside a folder name is kept
    # and a path without ".pdf" does not collide with the PDF itself.
    html_path = os.path.splitext(pdf_path)[0] + '.html'
    if _looks_like_html(body):
        html_out = body
    else:
        # Plain-text message: escape it and keep its line breaks in HTML.
        html_out = f"<pre>{body.translate(replacement)}</pre>"
    # Saves message in .HTML format as the PyQt does not handle links on its own
    with open(html_path, "w", encoding="utf-8") as f:
        f.write(html_out)
    # Saves message in .PDF for general use (missing links and web images)
    exported = export_to_pdf(body, pdf_path)
    print("Exported" if exported else f"Export failed: {pdf_path}")
    return exported


# Subclass QMainWindow to customize your application's main window
class MainWindow(QMainWindow):
    """Window whose constructor runs the export and then exits the process."""

    def __init__(self):
        super().__init__()
        msg_to_pdf(source_file, pdf_path)
        # Exits straight away: the window is never shown and the event loop
        # below is never entered.
        sys.exit()


if __name__ == "__main__":
    app = QApplication(sys.argv)
    main_window = MainWindow()
    sys.exit(app.exec())