fix: rewrite document converter to preserve formatting in PDF output

Replace jsPDF.text() (plain-text dump) with html2canvas-pro + jsPDF.addImage(),
which rasterizes the actual styled DOM elements into the PDF. DOCX headings,
bold, italic, tables, lists, and links now render correctly (plain .txt input
still uses jsPDF.text()). All HTML output also gets a full styled document
wrapper with embedded CSS. The HTML-to-Markdown converter is rewritten as a
proper DOM-walking parser instead of fragile regexes.
This commit is contained in:
noah
2026-03-09 18:12:42 +01:00
parent 7659136045
commit 221a72b2bf
3 changed files with 491 additions and 52 deletions
+16 -6
View File
@@ -1,17 +1,18 @@
{
"name": "2026-03-03_dbi_swift_macos_liquidglass",
"name": "transmute",
"version": "0.1.0",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "2026-03-03_dbi_swift_macos_liquidglass",
"name": "transmute",
"version": "0.1.0",
"dependencies": {
"@ffmpeg/ffmpeg": "^0.12.15",
"@ffmpeg/util": "^0.12.2",
"fast-xml-parser": "^5.4.2",
"framer-motion": "^12.35.2",
"html2canvas-pro": "^2.0.2",
"js-yaml": "^4.1.1",
"jspdf": "^4.2.0",
"jszip": "^3.10.1",
@@ -2564,7 +2565,6 @@
"resolved": "https://registry.npmjs.org/base64-arraybuffer/-/base64-arraybuffer-1.0.2.tgz",
"integrity": "sha512-I3yl4r9QB5ZRY3XuJVEPfc2XhZO6YweFPI+UovAzn+8/hb3oJ6lnysaFcjVpkCPfVWFUDvoZ8kmVDP7WyRtYtQ==",
"license": "MIT",
"optional": true,
"engines": {
"node": ">= 0.6.0"
}
@@ -2860,7 +2860,6 @@
"resolved": "https://registry.npmjs.org/css-line-break/-/css-line-break-2.1.0.tgz",
"integrity": "sha512-FHcKFCZcAha3LwfVBhCQbW2nCNbkZXn7KVUJcsT5/P8YmfsVja0FMPJr0B903j/E69HUphKiV9iQArX8SDYA4w==",
"license": "MIT",
"optional": true,
"dependencies": {
"utrie": "^1.0.2"
}
@@ -4253,6 +4252,19 @@
"node": ">=8.0.0"
}
},
"node_modules/html2canvas-pro": {
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/html2canvas-pro/-/html2canvas-pro-2.0.2.tgz",
"integrity": "sha512-9G/t0XgCZWonLwL0JwI7su6NdbOPUY7Ur4Ihpp8+XMaW9ibA2nDXF181Jr6tm94k8lX6sthpaXB3XqEnsMd5Cw==",
"license": "MIT",
"dependencies": {
"css-line-break": "^2.1.0",
"text-segmentation": "^1.0.3"
},
"engines": {
"node": ">=16.0.0"
}
},
"node_modules/ignore": {
"version": "5.3.2",
"resolved": "https://registry.npmjs.org/ignore/-/ignore-5.3.2.tgz",
@@ -6729,7 +6741,6 @@
"resolved": "https://registry.npmjs.org/text-segmentation/-/text-segmentation-1.0.3.tgz",
"integrity": "sha512-iOiPUo/BGnZ6+54OsWxZidGCsdU8YbE4PSpdPinp7DeMtUJNJBoJ/ouUSTJjHkh1KntHaltHl/gDs2FC4i5+Nw==",
"license": "MIT",
"optional": true,
"dependencies": {
"utrie": "^1.0.2"
}
@@ -7088,7 +7099,6 @@
"resolved": "https://registry.npmjs.org/utrie/-/utrie-1.0.2.tgz",
"integrity": "sha512-1MLa5ouZiOmQzUbjbu9VmjLzn1QLXBhwpUa7kdLUQK+KQ5KA9I1vk5U4YHe/X2Ch7PYnJfWuWT+VbuxbGwljhw==",
"license": "MIT",
"optional": true,
"dependencies": {
"base64-arraybuffer": "^1.0.2"
}
+1
View File
@@ -13,6 +13,7 @@
"@ffmpeg/util": "^0.12.2",
"fast-xml-parser": "^5.4.2",
"framer-motion": "^12.35.2",
"html2canvas-pro": "^2.0.2",
"js-yaml": "^4.1.1",
"jspdf": "^4.2.0",
"jszip": "^3.10.1",
+474 -46
View File
@@ -2,6 +2,10 @@ import { ConversionResult } from '@/types';
import { buildOutputFilename } from '@/lib/utils';
import { getExtension } from '@/lib/fileDetector';
/* ============================================
File reading helpers
============================================ */
async function readFileAsText(file: File): Promise<string> {
return new Promise((resolve, reject) => {
const reader = new FileReader();
@@ -20,10 +24,190 @@ async function readFileAsArrayBuffer(file: File): Promise<ArrayBuffer> {
});
}
/* ============================================
Styled HTML document wrapper
Used for the generated HTML output and as the
intermediate document for PDF rendering.
Embeds its full stylesheet in a <style> tag so
the result is self-contained, both as a
standalone .html file and when it is
rasterized into a PDF.
============================================ */
/**
 * Wraps an HTML fragment in a complete HTML5 document with an embedded stylesheet.
 *
 * Everything inside the template literal below (including the CSS comments) is
 * part of the emitted document.
 *
 * @param bodyHtml - Markup placed verbatim inside <body>; it is NOT escaped here,
 *   so callers must pass markup they are willing to emit as-is.
 * @param title - Text for the <title> element; it is passed through escapeHtml.
 * @returns The full document as a single string, starting with the doctype.
 */
function wrapInStyledHtml(bodyHtml: string, title: string): string {
return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>${escapeHtml(title)}</title>
<style>
/* Reset */
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
body {
font-family: 'Segoe UI', -apple-system, BlinkMacSystemFont, 'Helvetica Neue', Arial, sans-serif;
font-size: 14px;
line-height: 1.7;
color: #1a1a1a;
background: #ffffff;
padding: 40px;
max-width: 800px;
margin: 0 auto;
}
/* Headings */
h1, h2, h3, h4, h5, h6 {
margin-top: 1.4em;
margin-bottom: 0.6em;
font-weight: 700;
line-height: 1.3;
color: #111111;
}
h1 { font-size: 2em; border-bottom: 2px solid #e5e5e5; padding-bottom: 0.3em; }
h2 { font-size: 1.5em; border-bottom: 1px solid #eeeeee; padding-bottom: 0.25em; }
h3 { font-size: 1.25em; }
h4 { font-size: 1.1em; }
h5, h6 { font-size: 1em; color: #555555; }
/* Paragraphs & inline */
p { margin-bottom: 1em; }
strong, b { font-weight: 700; }
em, i { font-style: italic; }
u { text-decoration: underline; }
s, strike, del { text-decoration: line-through; color: #888; }
small { font-size: 0.85em; }
sup { vertical-align: super; font-size: 0.75em; }
sub { vertical-align: sub; font-size: 0.75em; }
mark { background: #fff3b0; padding: 0.1em 0.2em; border-radius: 2px; }
abbr { text-decoration: underline dotted; cursor: help; }
/* Links */
a { color: #0066cc; text-decoration: underline; }
a:hover { color: #004499; }
/* Lists */
ul, ol { margin-bottom: 1em; padding-left: 2em; }
ul ul, ol ol, ul ol, ol ul { margin-bottom: 0; }
li { margin-bottom: 0.3em; }
li > p { margin-bottom: 0.3em; }
/* Blockquote */
blockquote {
margin: 1em 0;
padding: 0.8em 1.2em;
border-left: 4px solid #0066cc;
background: #f6f8fa;
color: #333;
font-style: italic;
}
blockquote p:last-child { margin-bottom: 0; }
/* Code */
code {
font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
font-size: 0.9em;
background: #f0f0f0;
padding: 0.15em 0.4em;
border-radius: 3px;
color: #c7254e;
}
pre {
margin: 1em 0;
padding: 1em;
background: #f6f8fa;
border: 1px solid #e1e4e8;
border-radius: 6px;
overflow-x: auto;
font-size: 0.9em;
line-height: 1.5;
}
pre code {
background: none;
padding: 0;
border-radius: 0;
color: inherit;
}
/* Tables */
table {
width: 100%;
border-collapse: collapse;
margin: 1em 0;
font-size: 0.95em;
}
th, td {
padding: 8px 12px;
border: 1px solid #d0d7de;
text-align: left;
vertical-align: top;
}
th {
background: #f6f8fa;
font-weight: 700;
color: #111;
}
tr:nth-child(even) { background: #fafbfc; }
caption {
caption-side: bottom;
padding: 8px;
font-size: 0.9em;
color: #666;
font-style: italic;
}
/* Horizontal rule */
hr {
border: none;
border-top: 1px solid #e5e5e5;
margin: 2em 0;
}
/* Images embedded in documents */
img {
max-width: 100%;
height: auto;
border-radius: 4px;
margin: 1em 0;
}
/* Definition lists */
dl { margin-bottom: 1em; }
dt { font-weight: 700; margin-top: 0.5em; }
dd { margin-left: 2em; margin-bottom: 0.5em; }
/* Figure */
figure { margin: 1.5em 0; text-align: center; }
figcaption { font-size: 0.9em; color: #666; margin-top: 0.5em; font-style: italic; }
/* First element shouldn't have top margin */
body > *:first-child { margin-top: 0; }
</style>
</head>
<body>
${bodyHtml}
</body>
</html>`;
}
/**
 * Escapes the characters that are significant in HTML text and attribute
 * values so that arbitrary text can be embedded in markup safely.
 *
 * Handles `&`, `<`, `>`, `"` and `'`. The single quote is included so the
 * result is also safe inside single-quoted attribute values. All replacements
 * happen in one pass, so an ampersand introduced by an entity is never
 * escaped a second time.
 *
 * @param text - Raw, unescaped text.
 * @returns The text with the five special characters replaced by entities.
 */
function escapeHtml(text: string): string {
  const entities: Record<string, string> = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  return text.replace(/[&<>"']/g, (ch) => entities[ch] ?? ch);
}
/* ============================================
Source → HTML conversions
============================================ */
async function docxToHtml(file: File): Promise<string> {
const mammoth = await import('mammoth');
const arrayBuffer = await readFileAsArrayBuffer(file);
const result = await mammoth.convertToHtml({ arrayBuffer });
const result = await mammoth.convertToHtml({
arrayBuffer,
});
return result.value;
}
@@ -46,50 +230,271 @@ function htmlToText(html: string): string {
}
function htmlToMarkdown(html: string): string {
let md = html;
md = md.replace(/<h1[^>]*>(.*?)<\/h1>/gi, '# $1\n\n');
md = md.replace(/<h2[^>]*>(.*?)<\/h2>/gi, '## $1\n\n');
md = md.replace(/<h3[^>]*>(.*?)<\/h3>/gi, '### $1\n\n');
md = md.replace(/<h4[^>]*>(.*?)<\/h4>/gi, '#### $1\n\n');
md = md.replace(/<strong[^>]*>(.*?)<\/strong>/gi, '**$1**');
md = md.replace(/<b[^>]*>(.*?)<\/b>/gi, '**$1**');
md = md.replace(/<em[^>]*>(.*?)<\/em>/gi, '*$1*');
md = md.replace(/<i[^>]*>(.*?)<\/i>/gi, '*$1*');
md = md.replace(/<a[^>]*href="([^"]*)"[^>]*>(.*?)<\/a>/gi, '[$2]($1)');
md = md.replace(/<br\s*\/?>/gi, '\n');
md = md.replace(/<p[^>]*>(.*?)<\/p>/gi, '$1\n\n');
md = md.replace(/<li[^>]*>(.*?)<\/li>/gi, '- $1\n');
md = md.replace(/<[^>]+>/g, '');
md = md.replace(/&nbsp;/g, ' ');
md = md.replace(/&amp;/g, '&');
md = md.replace(/&lt;/g, '<');
md = md.replace(/&gt;/g, '>');
return md.trim();
// Parse properly using DOMParser for reliable conversion
const parser = new DOMParser();
const doc = parser.parseFromString(html, 'text/html');
function walk(node: Node): string {
if (node.nodeType === Node.TEXT_NODE) {
return node.textContent || '';
}
async function textToPdf(text: string): Promise<Blob> {
if (node.nodeType !== Node.ELEMENT_NODE) return '';
const el = node as Element;
const tag = el.tagName.toLowerCase();
const children = Array.from(el.childNodes).map(walk).join('');
switch (tag) {
case 'h1': return `# ${children.trim()}\n\n`;
case 'h2': return `## ${children.trim()}\n\n`;
case 'h3': return `### ${children.trim()}\n\n`;
case 'h4': return `#### ${children.trim()}\n\n`;
case 'h5': return `##### ${children.trim()}\n\n`;
case 'h6': return `###### ${children.trim()}\n\n`;
case 'p': return `${children.trim()}\n\n`;
case 'br': return '\n';
case 'hr': return '\n---\n\n';
case 'strong':
case 'b': return `**${children}**`;
case 'em':
case 'i': return `*${children}*`;
case 'u': return `<u>${children}</u>`;
case 's':
case 'strike':
case 'del': return `~~${children}~~`;
case 'code': return `\`${children}\``;
case 'pre': return `\n\`\`\`\n${children.trim()}\n\`\`\`\n\n`;
case 'blockquote': return children.split('\n').map(l => `> ${l}`).join('\n') + '\n\n';
case 'a': {
const href = el.getAttribute('href') || '';
return `[${children}](${href})`;
}
case 'img': {
const src = el.getAttribute('src') || '';
const alt = el.getAttribute('alt') || '';
return `![${alt}](${src})`;
}
case 'ul': {
const items = Array.from(el.children)
.filter(c => c.tagName.toLowerCase() === 'li')
.map(c => `- ${walk(c).trim()}`)
.join('\n');
return `${items}\n\n`;
}
case 'ol': {
const items = Array.from(el.children)
.filter(c => c.tagName.toLowerCase() === 'li')
.map((c, i) => `${i + 1}. ${walk(c).trim()}`)
.join('\n');
return `${items}\n\n`;
}
case 'li': return children;
case 'table': {
const rows = Array.from(el.querySelectorAll('tr'));
if (rows.length === 0) return children;
const tableData: string[][] = rows.map(row =>
Array.from(row.querySelectorAll('th, td')).map(cell => walk(cell).trim())
);
if (tableData.length === 0) return '';
const colCount = Math.max(...tableData.map(r => r.length));
const colWidths = Array.from({ length: colCount }, (_, i) =>
Math.max(3, ...tableData.map(r => (r[i] || '').length))
);
const formatRow = (row: string[]) =>
'| ' + colWidths.map((w, i) => (row[i] || '').padEnd(w)).join(' | ') + ' |';
const separator = '| ' + colWidths.map(w => '-'.repeat(w)).join(' | ') + ' |';
const lines = [formatRow(tableData[0]), separator, ...tableData.slice(1).map(formatRow)];
return lines.join('\n') + '\n\n';
}
case 'div':
case 'section':
case 'article':
case 'main':
case 'span':
return children;
default:
return children;
}
}
return walk(doc.body).replace(/\n{3,}/g, '\n\n').trim();
}
/* ============================================
HTML → PDF via html2canvas-pro + jsPDF
Renders a styled HTML document into a PDF by
injecting it into an off-screen DOM container,
rasterizing that container to a canvas with
html2canvas-pro, and placing the canvas on A4
pages as JPEG images (sliced across several
pages when the content is taller than one).
============================================ */
/**
 * Rasterizes an HTML document and returns it as an A4 PDF.
 *
 * The resulting PDF contains page images, not text objects.
 *
 * @param htmlContent - A full HTML document string. The `<style>` elements in
 *   its head and the inner HTML of its body are used; the rest is ignored.
 * @returns A Blob holding the generated PDF.
 * @throws Error if the rendered canvas is empty or a 2D canvas context cannot
 *   be obtained for a page slice.
 */
async function renderHtmlToPdf(htmlContent: string): Promise<Blob> {
  const { jsPDF } = await import('jspdf');
  // html2canvas-pro is called directly below to rasterize the content;
  // jsPDF is only used to assemble the resulting images into pages.
  const html2canvas = (await import('html2canvas-pro')).default;

  const renderWidthPx = 794; // A4 width in px at 96dpi

  // Off-screen container that holds the content while it is rendered
  const container = document.createElement('div');
  container.style.position = 'fixed';
  container.style.left = '-10000px';
  container.style.top = '0';
  container.style.width = `${renderWidthPx}px`;
  container.style.background = '#ffffff';
  container.style.zIndex = '-9999';

  const parsed = new DOMParser().parseFromString(htmlContent, 'text/html');

  // Copy every <style> from the document head (styles located in the body
  // travel along with the body markup below).
  // NOTE(review): these rules live in the host document while the container is
  // attached, so broad selectors (e.g. `body`, `*`) also affect the page itself
  // until the container is removed — confirm this is acceptable.
  for (const sourceStyle of Array.from(parsed.head.querySelectorAll('style'))) {
    const style = document.createElement('style');
    style.textContent = sourceStyle.textContent;
    container.appendChild(style);
  }

  const content = document.createElement('div');
  // NOTE(review): the markup is inserted unsanitized; inline event handlers in
  // untrusted input (e.g. <img onerror>) could execute — confirm input is trusted.
  content.innerHTML = parsed.body.innerHTML;
  content.style.padding = '40px';
  content.style.fontFamily = "'Segoe UI', -apple-system, BlinkMacSystemFont, 'Helvetica Neue', Arial, sans-serif";
  content.style.fontSize = '14px';
  content.style.lineHeight = '1.7';
  content.style.color = '#1a1a1a';
  container.appendChild(content);

  document.body.appendChild(container);

  try {
    // Fixed settle delay so the browser can lay out the injected content.
    // This does not track font or image loading.
    await new Promise((resolve) => setTimeout(resolve, 100));

    // A4 dimensions in mm: 210 x 297
    const pdfWidth = 210;
    const pdfHeight = 297;
    const margin = 15; // mm

    const canvas = await html2canvas(content, {
      scale: 2, // Higher resolution
      useCORS: true,
      allowTaint: true,
      backgroundColor: '#ffffff',
      width: renderWidthPx,
      windowWidth: renderWidthPx,
    });

    if (canvas.width === 0 || canvas.height === 0) {
      throw new Error('Rendered document is empty; nothing to write to the PDF');
    }

    const imgWidth = pdfWidth - margin * 2; // printable width in mm
    const pageContentHeight = pdfHeight - margin * 2; // printable height in mm

    // Work in whole canvas pixels so every slice has an integer height of at
    // least 1px. Fractional slice heights could otherwise leave a trailing
    // slice under 1px, which becomes a zero-height canvas and an invalid image.
    const pxPerMm = canvas.width / imgWidth;
    const pageHeightPx = Math.max(1, Math.floor(pageContentHeight * pxPerMm));
    const totalPages = Math.ceil(canvas.height / pageHeightPx);

    const doc = new jsPDF('p', 'mm', 'a4');

    for (let page = 0; page < totalPages; page++) {
      if (page > 0) doc.addPage();

      const sourceY = page * pageHeightPx;
      const sourceHeight = Math.min(pageHeightPx, canvas.height - sourceY);

      // A single-page document can use the rendered canvas directly
      let pageCanvas: HTMLCanvasElement = canvas;
      if (totalPages > 1) {
        pageCanvas = document.createElement('canvas');
        pageCanvas.width = canvas.width;
        pageCanvas.height = sourceHeight;

        const ctx = pageCanvas.getContext('2d');
        if (!ctx) {
          throw new Error('Could not get a 2D canvas context for PDF page rendering');
        }
        // JPEG has no alpha channel, so paint a white background first
        ctx.fillStyle = '#ffffff';
        ctx.fillRect(0, 0, pageCanvas.width, pageCanvas.height);
        ctx.drawImage(
          canvas,
          0, sourceY,
          canvas.width, sourceHeight,
          0, 0,
          canvas.width, sourceHeight
        );
      }

      doc.addImage(
        pageCanvas.toDataURL('image/jpeg', 0.95),
        'JPEG',
        margin,
        margin,
        imgWidth,
        sourceHeight / pxPerMm // slice height converted back to mm
      );
    }

    return doc.output('blob');
  } finally {
    // Always detach the temporary container, even when rendering fails
    container.remove();
  }
}
/* ============================================
Plain text → PDF (for .txt files)
Still uses jsPDF.text() since plain text
has no formatting to preserve.
============================================ */
async function plainTextToPdf(text: string): Promise<Blob> {
const { jsPDF } = await import('jspdf');
const doc = new jsPDF();
const lines = doc.splitTextToSize(text, 180);
let y = 15;
doc.setFont('courier', 'normal');
doc.setFontSize(11);
const lines = doc.splitTextToSize(text, 170);
let y = 20;
const pageHeight = doc.internal.pageSize.getHeight();
for (const line of lines) {
if (y > pageHeight - 15) {
if (y > pageHeight - 20) {
doc.addPage();
y = 15;
y = 20;
}
doc.text(line, 15, y);
y += 7;
doc.text(line, 20, y);
y += 6;
}
return doc.output('blob');
}
async function htmlToPdf(html: string): Promise<Blob> {
const text = htmlToText(html);
return textToPdf(text);
}
/* ============================================
PDF → Text extraction
============================================ */
async function pdfToText(file: File): Promise<string> {
const { PDFDocument } = await import('pdf-lib');
@@ -117,6 +522,10 @@ async function pdfToText(file: File): Promise<string> {
return text;
}
/* ============================================
Main export
============================================ */
export async function convertDocument(
file: File,
targetFormat: string,
@@ -132,14 +541,18 @@ export async function convertDocument(
switch (sourceExt) {
case 'docx': {
if (targetFormat === 'html') {
const html = await docxToHtml(file);
resultBlob = new Blob([html], { type: 'text/html' });
const bodyHtml = await docxToHtml(file);
const styledHtml = wrapInStyledHtml(bodyHtml, file.name);
resultBlob = new Blob([styledHtml], { type: 'text/html' });
} else if (targetFormat === 'txt') {
const text = await docxToText(file);
resultBlob = new Blob([text], { type: 'text/plain' });
} else if (targetFormat === 'pdf') {
const html = await docxToHtml(file);
resultBlob = await htmlToPdf(html);
onProgress?.(40);
const bodyHtml = await docxToHtml(file);
const styledHtml = wrapInStyledHtml(bodyHtml, file.name);
onProgress?.(60);
resultBlob = await renderHtmlToPdf(styledHtml);
} else {
throw new Error(`Unsupported: docx to ${targetFormat}`);
}
@@ -149,12 +562,20 @@ export async function convertDocument(
case 'md': {
const mdText = await readFileAsText(file);
if (targetFormat === 'html') {
const html = await markdownToHtml(mdText);
resultBlob = new Blob([html], { type: 'text/html' });
const bodyHtml = await markdownToHtml(mdText);
const styledHtml = wrapInStyledHtml(bodyHtml, file.name);
resultBlob = new Blob([styledHtml], { type: 'text/html' });
} else if (targetFormat === 'pdf') {
resultBlob = await textToPdf(mdText);
onProgress?.(40);
const bodyHtml = await markdownToHtml(mdText);
const styledHtml = wrapInStyledHtml(bodyHtml, file.name);
onProgress?.(60);
resultBlob = await renderHtmlToPdf(styledHtml);
} else if (targetFormat === 'txt') {
resultBlob = new Blob([mdText], { type: 'text/plain' });
// Strip markdown syntax for plain text
const bodyHtml = await markdownToHtml(mdText);
const text = htmlToText(bodyHtml);
resultBlob = new Blob([text], { type: 'text/plain' });
} else {
throw new Error(`Unsupported: md to ${targetFormat}`);
}
@@ -163,14 +584,20 @@ export async function convertDocument(
case 'html':
case 'htm': {
const html = await readFileAsText(file);
const rawHtml = await readFileAsText(file);
if (targetFormat === 'pdf') {
resultBlob = await htmlToPdf(html);
onProgress?.(40);
// If the HTML is already a full document (contains a doctype or an <html> tag),
// pass it through unchanged; otherwise wrap the fragment in our styled wrapper
const hasFullDoc = rawHtml.toLowerCase().includes('<!doctype') || rawHtml.toLowerCase().includes('<html');
const htmlForPdf = hasFullDoc ? rawHtml : wrapInStyledHtml(rawHtml, file.name);
onProgress?.(60);
resultBlob = await renderHtmlToPdf(htmlForPdf);
} else if (targetFormat === 'txt') {
const text = htmlToText(html);
const text = htmlToText(rawHtml);
resultBlob = new Blob([text], { type: 'text/plain' });
} else if (targetFormat === 'md') {
const md = htmlToMarkdown(html);
const md = htmlToMarkdown(rawHtml);
resultBlob = new Blob([md], { type: 'text/markdown' });
} else {
throw new Error(`Unsupported: html to ${targetFormat}`);
@@ -181,10 +608,11 @@ export async function convertDocument(
case 'txt': {
const text = await readFileAsText(file);
if (targetFormat === 'pdf') {
resultBlob = await textToPdf(text);
resultBlob = await plainTextToPdf(text);
} else if (targetFormat === 'html') {
const html = `<!DOCTYPE html><html><head><meta charset="utf-8"></head><body><pre>${text.replace(/</g, '&lt;').replace(/>/g, '&gt;')}</pre></body></html>`;
resultBlob = new Blob([html], { type: 'text/html' });
const bodyHtml = `<pre><code>${escapeHtml(text)}</code></pre>`;
const styledHtml = wrapInStyledHtml(bodyHtml, file.name);
resultBlob = new Blob([styledHtml], { type: 'text/html' });
} else if (targetFormat === 'md') {
resultBlob = new Blob([text], { type: 'text/markdown' });
} else {