From 221a72b2bffe503347dd78cc6c6a8d3a4f5167cd Mon Sep 17 00:00:00 2001 From: noah Date: Mon, 9 Mar 2026 18:12:42 +0100 Subject: [PATCH] fix: rewrite document converter to preserve formatting in PDF output Replace jsPDF.text() (plain text dump) with jsPDF.html() + html2canvas-pro which renders actual styled DOM elements into the PDF. DOCX headings, bold, italic, tables, lists, links now render correctly. All HTML output also gets a full styled document wrapper with embedded CSS. HTML-to-Markdown converter rewritten with proper DOM-walking parser instead of fragile regex. --- package-lock.json | 22 +- package.json | 1 + src/lib/converters/documentConverter.ts | 520 +++++++++++++++++++++--- 3 files changed, 491 insertions(+), 52 deletions(-) diff --git a/package-lock.json b/package-lock.json index 59fbac5..8c37d7a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,17 +1,18 @@ { - "name": "2026-03-03_dbi_swift_macos_liquidglass", + "name": "transmute", "version": "0.1.0", "lockfileVersion": 3, "requires": true, "packages": { "": { - "name": "2026-03-03_dbi_swift_macos_liquidglass", + "name": "transmute", "version": "0.1.0", "dependencies": { "@ffmpeg/ffmpeg": "^0.12.15", "@ffmpeg/util": "^0.12.2", "fast-xml-parser": "^5.4.2", "framer-motion": "^12.35.2", + "html2canvas-pro": "^2.0.2", "js-yaml": "^4.1.1", "jspdf": "^4.2.0", "jszip": "^3.10.1", @@ -2564,7 +2565,6 @@ "resolved": "https://registry.npmjs.org/base64-arraybuffer/-/base64-arraybuffer-1.0.2.tgz", "integrity": "sha512-I3yl4r9QB5ZRY3XuJVEPfc2XhZO6YweFPI+UovAzn+8/hb3oJ6lnysaFcjVpkCPfVWFUDvoZ8kmVDP7WyRtYtQ==", "license": "MIT", - "optional": true, "engines": { "node": ">= 0.6.0" } @@ -2860,7 +2860,6 @@ "resolved": "https://registry.npmjs.org/css-line-break/-/css-line-break-2.1.0.tgz", "integrity": "sha512-FHcKFCZcAha3LwfVBhCQbW2nCNbkZXn7KVUJcsT5/P8YmfsVja0FMPJr0B903j/E69HUphKiV9iQArX8SDYA4w==", "license": "MIT", - "optional": true, "dependencies": { "utrie": "^1.0.2" } @@ -4253,6 +4252,19 @@ "node": 
">=8.0.0" } }, + "node_modules/html2canvas-pro": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/html2canvas-pro/-/html2canvas-pro-2.0.2.tgz", + "integrity": "sha512-9G/t0XgCZWonLwL0JwI7su6NdbOPUY7Ur4Ihpp8+XMaW9ibA2nDXF181Jr6tm94k8lX6sthpaXB3XqEnsMd5Cw==", + "license": "MIT", + "dependencies": { + "css-line-break": "^2.1.0", + "text-segmentation": "^1.0.3" + }, + "engines": { + "node": ">=16.0.0" + } + }, "node_modules/ignore": { "version": "5.3.2", "resolved": "https://registry.npmjs.org/ignore/-/ignore-5.3.2.tgz", @@ -6729,7 +6741,6 @@ "resolved": "https://registry.npmjs.org/text-segmentation/-/text-segmentation-1.0.3.tgz", "integrity": "sha512-iOiPUo/BGnZ6+54OsWxZidGCsdU8YbE4PSpdPinp7DeMtUJNJBoJ/ouUSTJjHkh1KntHaltHl/gDs2FC4i5+Nw==", "license": "MIT", - "optional": true, "dependencies": { "utrie": "^1.0.2" } @@ -7088,7 +7099,6 @@ "resolved": "https://registry.npmjs.org/utrie/-/utrie-1.0.2.tgz", "integrity": "sha512-1MLa5ouZiOmQzUbjbu9VmjLzn1QLXBhwpUa7kdLUQK+KQ5KA9I1vk5U4YHe/X2Ch7PYnJfWuWT+VbuxbGwljhw==", "license": "MIT", - "optional": true, "dependencies": { "base64-arraybuffer": "^1.0.2" } diff --git a/package.json b/package.json index f07fe67..b5ece41 100644 --- a/package.json +++ b/package.json @@ -13,6 +13,7 @@ "@ffmpeg/util": "^0.12.2", "fast-xml-parser": "^5.4.2", "framer-motion": "^12.35.2", + "html2canvas-pro": "^2.0.2", "js-yaml": "^4.1.1", "jspdf": "^4.2.0", "jszip": "^3.10.1", diff --git a/src/lib/converters/documentConverter.ts b/src/lib/converters/documentConverter.ts index 06da972..e0f43bb 100644 --- a/src/lib/converters/documentConverter.ts +++ b/src/lib/converters/documentConverter.ts @@ -2,6 +2,10 @@ import { ConversionResult } from '@/types'; import { buildOutputFilename } from '@/lib/utils'; import { getExtension } from '@/lib/fileDetector'; +/* ============================================ + File reading helpers + ============================================ */ + async function readFileAsText(file: File): Promise { return new 
Promise((resolve, reject) => { const reader = new FileReader(); @@ -20,10 +24,190 @@ async function readFileAsArrayBuffer(file: File): Promise { }); } +/* ============================================ + Styled HTML document wrapper + + This is used for ALL HTML output and as the + intermediate step for PDF rendering. Embeds + full CSS so the document looks correct both + as a standalone .html file and when rendered + to PDF via renderHtmlToPdf(). + ============================================ */ + +function wrapInStyledHtml(bodyHtml: string, title: string): string { + return ` + + + + +${escapeHtml(title)} + + + +${bodyHtml} + +`; +} + +function escapeHtml(text: string): string { + return text + .replace(/&/g, '&') + .replace(//g, '>') + .replace(/"/g, '"'); +} + +/* ============================================ + Source → HTML conversions + ============================================ */ + async function docxToHtml(file: File): Promise { const mammoth = await import('mammoth'); const arrayBuffer = await readFileAsArrayBuffer(file); - const result = await mammoth.convertToHtml({ arrayBuffer }); + const result = await mammoth.convertToHtml({ + arrayBuffer, + }); return result.value; } @@ -46,50 +230,271 @@ function htmlToText(html: string): string { } function htmlToMarkdown(html: string): string { - let md = html; - md = md.replace(/]*>(.*?)<\/h1>/gi, '# $1\n\n'); - md = md.replace(/]*>(.*?)<\/h2>/gi, '## $1\n\n'); - md = md.replace(/]*>(.*?)<\/h3>/gi, '### $1\n\n'); - md = md.replace(/]*>(.*?)<\/h4>/gi, '#### $1\n\n'); - md = md.replace(/]*>(.*?)<\/strong>/gi, '**$1**'); - md = md.replace(/]*>(.*?)<\/b>/gi, '**$1**'); - md = md.replace(/]*>(.*?)<\/em>/gi, '*$1*'); - md = md.replace(/]*>(.*?)<\/i>/gi, '*$1*'); - md = md.replace(/]*href="([^"]*)"[^>]*>(.*?)<\/a>/gi, '[$2]($1)'); - md = md.replace(//gi, '\n'); - md = md.replace(/]*>(.*?)<\/p>/gi, '$1\n\n'); - md = md.replace(/]*>(.*?)<\/li>/gi, '- $1\n'); - md = md.replace(/<[^>]+>/g, ''); - md = md.replace(/ /g, ' '); - md 
= md.replace(/&/g, '&'); - md = md.replace(/</g, '<'); - md = md.replace(/>/g, '>'); - return md.trim(); + // Parse properly using DOMParser for reliable conversion + const parser = new DOMParser(); + const doc = parser.parseFromString(html, 'text/html'); + + function walk(node: Node): string { + if (node.nodeType === Node.TEXT_NODE) { + return node.textContent || ''; + } + + if (node.nodeType !== Node.ELEMENT_NODE) return ''; + + const el = node as Element; + const tag = el.tagName.toLowerCase(); + const children = Array.from(el.childNodes).map(walk).join(''); + + switch (tag) { + case 'h1': return `# ${children.trim()}\n\n`; + case 'h2': return `## ${children.trim()}\n\n`; + case 'h3': return `### ${children.trim()}\n\n`; + case 'h4': return `#### ${children.trim()}\n\n`; + case 'h5': return `##### ${children.trim()}\n\n`; + case 'h6': return `###### ${children.trim()}\n\n`; + case 'p': return `${children.trim()}\n\n`; + case 'br': return '\n'; + case 'hr': return '\n---\n\n'; + case 'strong': + case 'b': return `**${children}**`; + case 'em': + case 'i': return `*${children}*`; + case 'u': return `${children}`; + case 's': + case 'strike': + case 'del': return `~~${children}~~`; + case 'code': return `\`${children}\``; + case 'pre': return `\n\`\`\`\n${children.trim()}\n\`\`\`\n\n`; + case 'blockquote': return children.split('\n').map(l => `> ${l}`).join('\n') + '\n\n'; + case 'a': { + const href = el.getAttribute('href') || ''; + return `[${children}](${href})`; + } + case 'img': { + const src = el.getAttribute('src') || ''; + const alt = el.getAttribute('alt') || ''; + return `![${alt}](${src})`; + } + case 'ul': { + const items = Array.from(el.children) + .filter(c => c.tagName.toLowerCase() === 'li') + .map(c => `- ${walk(c).trim()}`) + .join('\n'); + return `${items}\n\n`; + } + case 'ol': { + const items = Array.from(el.children) + .filter(c => c.tagName.toLowerCase() === 'li') + .map((c, i) => `${i + 1}. 
${walk(c).trim()}`) + .join('\n'); + return `${items}\n\n`; + } + case 'li': return children; + case 'table': { + const rows = Array.from(el.querySelectorAll('tr')); + if (rows.length === 0) return children; + + const tableData: string[][] = rows.map(row => + Array.from(row.querySelectorAll('th, td')).map(cell => walk(cell).trim()) + ); + + if (tableData.length === 0) return ''; + + const colCount = Math.max(...tableData.map(r => r.length)); + const colWidths = Array.from({ length: colCount }, (_, i) => + Math.max(3, ...tableData.map(r => (r[i] || '').length)) + ); + + const formatRow = (row: string[]) => + '| ' + colWidths.map((w, i) => (row[i] || '').padEnd(w)).join(' | ') + ' |'; + + const separator = '| ' + colWidths.map(w => '-'.repeat(w)).join(' | ') + ' |'; + + const lines = [formatRow(tableData[0]), separator, ...tableData.slice(1).map(formatRow)]; + return lines.join('\n') + '\n\n'; + } + case 'div': + case 'section': + case 'article': + case 'main': + case 'span': + return children; + default: + return children; + } + } + + return walk(doc.body).replace(/\n{3,}/g, '\n\n').trim(); } -async function textToPdf(text: string): Promise { +/* ============================================ + HTML → PDF via html2canvas-pro and jsPDF.addImage() + + Renders a styled HTML document into a PDF by + injecting it into a hidden DOM container, + rasterizing it with html2canvas-pro, and adding + the image to A4 pages; jsPDF.html() is not used. 
+ ============================================ */ + async function renderHtmlToPdf(htmlContent: string): Promise { + const { jsPDF } = await import('jspdf'); + // html2canvas-pro is called directly below to rasterize + // the content; jsPDF.html() is not used in this function + const html2canvas = (await import('html2canvas-pro')).default; + + // Create a hidden container for rendering + const container = document.createElement('div'); + container.style.position = 'fixed'; + container.style.left = '-10000px'; + container.style.top = '0'; + container.style.width = '794px'; // A4 width in px at 96dpi + container.style.background = '#ffffff'; + container.style.zIndex = '-9999'; + + // Parse the HTML and inject just the body + styles + const parser = new DOMParser(); + const parsed = parser.parseFromString(htmlContent, 'text/html'); + + // Copy the CSS of the first style element into the container + const styleEl = parsed.querySelector('style'); + const bodyContent = parsed.body.innerHTML; + + if (styleEl) { + const style = document.createElement('style'); + style.textContent = styleEl.textContent; + container.appendChild(style); + } + + const content = document.createElement('div'); + content.innerHTML = bodyContent; + content.style.padding = '40px'; + content.style.fontFamily = "'Segoe UI', -apple-system, BlinkMacSystemFont, 'Helvetica Neue', Arial, sans-serif"; + content.style.fontSize = '14px'; + content.style.lineHeight = '1.7'; + content.style.color = '#1a1a1a'; + container.appendChild(content); + + document.body.appendChild(container); + + // Fixed 100 ms pause before capture (not tied to font/image load events) + await new Promise((resolve) => setTimeout(resolve, 100)); + + try { + // A4 dimensions in mm: 210 x 297 + const pdfWidth = 210; + const pdfHeight = 297; + const margin = 15; // mm + + // Capture the rendered content as a canvas + const canvas = await html2canvas(content, { + scale: 2, // Higher resolution + useCORS: true, + allowTaint: true, + backgroundColor: '#ffffff', + width: 794, + windowWidth: 794, + }); + + // Calculate how the 
content maps to PDF pages + const imgWidth = pdfWidth - margin * 2; + const imgHeight = (canvas.height * imgWidth) / canvas.width; + + const doc = new jsPDF('p', 'mm', 'a4'); + const pageContentHeight = pdfHeight - margin * 2; + + if (imgHeight <= pageContentHeight) { + // Single page — fits entirely + doc.addImage( + canvas.toDataURL('image/jpeg', 0.95), + 'JPEG', + margin, + margin, + imgWidth, + imgHeight + ); + } else { + // Multi-page — slice the canvas into page-sized chunks + const totalPages = Math.ceil(imgHeight / pageContentHeight); + + for (let page = 0; page < totalPages; page++) { + if (page > 0) doc.addPage(); + + // Calculate the portion of the source canvas for this page + const sourceY = (page * pageContentHeight * canvas.width) / imgWidth; + const sourceHeight = Math.min( + (pageContentHeight * canvas.width) / imgWidth, + canvas.height - sourceY + ); + + // Create a canvas slice for this page + const pageCanvas = document.createElement('canvas'); + pageCanvas.width = canvas.width; + pageCanvas.height = sourceHeight; + + const ctx = pageCanvas.getContext('2d'); + if (ctx) { + ctx.fillStyle = '#ffffff'; + ctx.fillRect(0, 0, pageCanvas.width, pageCanvas.height); + ctx.drawImage( + canvas, + 0, sourceY, + canvas.width, sourceHeight, + 0, 0, + canvas.width, sourceHeight + ); + } + + const sliceHeight = (sourceHeight * imgWidth) / canvas.width; + + doc.addImage( + pageCanvas.toDataURL('image/jpeg', 0.95), + 'JPEG', + margin, + margin, + imgWidth, + sliceHeight + ); + } + } + + return doc.output('blob'); + } finally { + document.body.removeChild(container); + } +} + +/* ============================================ + Plain text → PDF (for .txt files) + Still uses jsPDF.text() since plain text + has no formatting to preserve. 
+ ============================================ */ + +async function plainTextToPdf(text: string): Promise { const { jsPDF } = await import('jspdf'); const doc = new jsPDF(); - const lines = doc.splitTextToSize(text, 180); - let y = 15; + + doc.setFont('courier', 'normal'); + doc.setFontSize(11); + + const lines = doc.splitTextToSize(text, 170); + let y = 20; const pageHeight = doc.internal.pageSize.getHeight(); for (const line of lines) { - if (y > pageHeight - 15) { + if (y > pageHeight - 20) { doc.addPage(); - y = 15; + y = 20; } - doc.text(line, 15, y); - y += 7; + doc.text(line, 20, y); + y += 6; } return doc.output('blob'); } -async function htmlToPdf(html: string): Promise { - const text = htmlToText(html); - return textToPdf(text); -} +/* ============================================ + PDF → Text extraction + ============================================ */ async function pdfToText(file: File): Promise { const { PDFDocument } = await import('pdf-lib'); @@ -117,6 +522,10 @@ async function pdfToText(file: File): Promise { return text; } +/* ============================================ + Main export + ============================================ */ + export async function convertDocument( file: File, targetFormat: string, @@ -132,14 +541,18 @@ export async function convertDocument( switch (sourceExt) { case 'docx': { if (targetFormat === 'html') { - const html = await docxToHtml(file); - resultBlob = new Blob([html], { type: 'text/html' }); + const bodyHtml = await docxToHtml(file); + const styledHtml = wrapInStyledHtml(bodyHtml, file.name); + resultBlob = new Blob([styledHtml], { type: 'text/html' }); } else if (targetFormat === 'txt') { const text = await docxToText(file); resultBlob = new Blob([text], { type: 'text/plain' }); } else if (targetFormat === 'pdf') { - const html = await docxToHtml(file); - resultBlob = await htmlToPdf(html); + onProgress?.(40); + const bodyHtml = await docxToHtml(file); + const styledHtml = wrapInStyledHtml(bodyHtml, file.name); + 
onProgress?.(60); + resultBlob = await renderHtmlToPdf(styledHtml); } else { throw new Error(`Unsupported: docx to ${targetFormat}`); } @@ -149,12 +562,20 @@ export async function convertDocument( case 'md': { const mdText = await readFileAsText(file); if (targetFormat === 'html') { - const html = await markdownToHtml(mdText); - resultBlob = new Blob([html], { type: 'text/html' }); + const bodyHtml = await markdownToHtml(mdText); + const styledHtml = wrapInStyledHtml(bodyHtml, file.name); + resultBlob = new Blob([styledHtml], { type: 'text/html' }); } else if (targetFormat === 'pdf') { - resultBlob = await textToPdf(mdText); + onProgress?.(40); + const bodyHtml = await markdownToHtml(mdText); + const styledHtml = wrapInStyledHtml(bodyHtml, file.name); + onProgress?.(60); + resultBlob = await renderHtmlToPdf(styledHtml); } else if (targetFormat === 'txt') { - resultBlob = new Blob([mdText], { type: 'text/plain' }); + // Strip markdown syntax for plain text + const bodyHtml = await markdownToHtml(mdText); + const text = htmlToText(bodyHtml); + resultBlob = new Blob([text], { type: 'text/plain' }); } else { throw new Error(`Unsupported: md to ${targetFormat}`); } @@ -163,14 +584,20 @@ export async function convertDocument( case 'html': case 'htm': { - const html = await readFileAsText(file); + const rawHtml = await readFileAsText(file); if (targetFormat === 'pdf') { - resultBlob = await htmlToPdf(html); + onProgress?.(40); + // If the HTML already has a