package converter import ( "archive/zip" "fmt" "io" "os" "os/exec" "strings" "github.com/russross/blackfriday/v2" ) func convertDocument(inputPath, outputPath, sourceExt, targetFormat string) error { raw, err := os.ReadFile(inputPath) if err != nil { return fmt.Errorf("reading document: %w", err) } content := string(raw) switch sourceExt { case "txt": return convertFromTxt(content, outputPath, targetFormat) case "md": return convertFromMarkdown(content, outputPath, targetFormat) case "html", "htm": return convertFromHTML(content, outputPath, targetFormat) case "rtf": return convertFromRTF(content, outputPath, targetFormat) case "docx": return convertDocx(inputPath, outputPath, targetFormat) case "pdf": return convertPdf(inputPath, outputPath, targetFormat) default: return fmt.Errorf("unsupported document source: %s", sourceExt) } } // ─── TXT conversions ───────────────────────────────────────── func convertFromTxt(content, outputPath, target string) error { switch target { case "html": html := "
" + escapeHTML(content) + "
" return os.WriteFile(outputPath, []byte(html), 0o644) case "md": return os.WriteFile(outputPath, []byte(content), 0o644) case "pdf": return textToPDF(content, outputPath) default: return fmt.Errorf("unsupported target for txt: %s", target) } } // ─── Markdown conversions ──────────────────────────────────── func convertFromMarkdown(content, outputPath, target string) error { switch target { case "html": html := blackfriday.Run([]byte(content)) wrapped := "" + string(html) + "" return os.WriteFile(outputPath, []byte(wrapped), 0o644) case "txt": text := stripMarkdown(content) return os.WriteFile(outputPath, []byte(text), 0o644) case "pdf": html := string(blackfriday.Run([]byte(content))) return htmlToPDF(html, outputPath) default: return fmt.Errorf("unsupported target for md: %s", target) } } // ─── HTML conversions ──────────────────────────────────────── func convertFromHTML(content, outputPath, target string) error { switch target { case "txt": text := stripHTMLTags(content) return os.WriteFile(outputPath, []byte(text), 0o644) case "md": md := htmlToMarkdown(content) return os.WriteFile(outputPath, []byte(md), 0o644) case "pdf": return htmlToPDF(content, outputPath) default: return fmt.Errorf("unsupported target for html: %s", target) } } // ─── RTF conversions ───────────────────────────────────────── func convertFromRTF(content, outputPath, target string) error { text := stripRTF(content) switch target { case "txt": return os.WriteFile(outputPath, []byte(text), 0o644) case "html": html := "
" + escapeHTML(text) + "
" return os.WriteFile(outputPath, []byte(html), 0o644) case "md": return os.WriteFile(outputPath, []byte(text), 0o644) default: return fmt.Errorf("unsupported target for rtf: %s", target) } } // ─── DOCX conversions ──────────────────────────────────────── func convertDocx(inputPath, outputPath, target string) error { text, err := extractDocxText(inputPath) if err != nil { return fmt.Errorf("extracting DOCX text: %w", err) } switch target { case "txt": return os.WriteFile(outputPath, []byte(text), 0o644) case "html": html := "
" + escapeHTML(text) + "
" return os.WriteFile(outputPath, []byte(html), 0o644) case "md": return os.WriteFile(outputPath, []byte(text), 0o644) case "pdf": return textToPDF(text, outputPath) default: return fmt.Errorf("unsupported target for docx: %s", target) } } // ─── PDF conversions ───────────────────────────────────────── func convertPdf(inputPath, outputPath, target string) error { text, err := extractPDFText(inputPath) if err != nil { return fmt.Errorf("extracting PDF text: %w", err) } switch target { case "txt": return os.WriteFile(outputPath, []byte(text), 0o644) case "html": html := "
" + escapeHTML(text) + "
" return os.WriteFile(outputPath, []byte(html), 0o644) case "md": return os.WriteFile(outputPath, []byte(text), 0o644) default: return fmt.Errorf("unsupported target for pdf: %s", target) } } // ─── Helpers ───────────────────────────────────────────────── func escapeHTML(s string) string { s = strings.ReplaceAll(s, "&", "&") s = strings.ReplaceAll(s, "<", "<") s = strings.ReplaceAll(s, ">", ">") return s } func stripHTMLTags(html string) string { var result strings.Builder inTag := false for _, r := range html { switch { case r == '<': inTag = true case r == '>': inTag = false case !inTag: result.WriteRune(r) } } return strings.TrimSpace(result.String()) } func stripMarkdown(md string) string { lines := strings.Split(md, "\n") var result []string for _, line := range lines { line = strings.TrimLeft(line, "# ") line = strings.ReplaceAll(line, "**", "") line = strings.ReplaceAll(line, "*", "") line = strings.ReplaceAll(line, "__", "") line = strings.ReplaceAll(line, "_", "") line = strings.ReplaceAll(line, "`", "") result = append(result, line) } return strings.Join(result, "\n") } func htmlToMarkdown(html string) string { md := html md = strings.ReplaceAll(md, "
", "\n") md = strings.ReplaceAll(md, "
", "\n") md = strings.ReplaceAll(md, "
", "\n") md = strings.ReplaceAll(md, "

", "\n") md = strings.ReplaceAll(md, "

", "\n") md = strings.ReplaceAll(md, "", "**") md = strings.ReplaceAll(md, "", "**") md = strings.ReplaceAll(md, "", "*") md = strings.ReplaceAll(md, "", "*") md = strings.ReplaceAll(md, "

", "# ") md = strings.ReplaceAll(md, "

", "\n") md = strings.ReplaceAll(md, "

", "## ") md = strings.ReplaceAll(md, "

", "\n") md = strings.ReplaceAll(md, "

", "### ") md = strings.ReplaceAll(md, "

", "\n") md = stripHTMLTags(md) return strings.TrimSpace(md) } func stripRTF(rtf string) string { var result strings.Builder i := 0 depth := 0 for i < len(rtf) { ch := rtf[i] switch { case ch == '{': depth++ i++ case ch == '}': depth-- i++ case ch == '\\': i++ if i < len(rtf) && rtf[i] == '\'' { i += 3 } else { for i < len(rtf) && ((rtf[i] >= 'a' && rtf[i] <= 'z') || (rtf[i] >= 'A' && rtf[i] <= 'Z')) { i++ } for i < len(rtf) && ((rtf[i] >= '0' && rtf[i] <= '9') || rtf[i] == '-') { i++ } if i < len(rtf) && rtf[i] == ' ' { i++ } } default: if depth <= 1 { result.WriteByte(ch) } i++ } } return strings.TrimSpace(result.String()) } // extractDocxText extracts plain text from a .docx file (ZIP of XML files). func extractDocxText(path string) (string, error) { r, err := zip.OpenReader(path) if err != nil { return "", fmt.Errorf("opening docx: %w", err) } defer r.Close() for _, f := range r.File { if f.Name == "word/document.xml" { rc, err := f.Open() if err != nil { return "", err } defer rc.Close() data, err := io.ReadAll(rc) if err != nil { return "", err } return stripHTMLTags(string(data)), nil } } return "", fmt.Errorf("word/document.xml not found in docx") } // extractPDFText tries pdftotext (poppler-utils), falls back to error. func extractPDFText(path string) (string, error) { pdftotextPath, err := exec.LookPath("pdftotext") if err != nil { return "", fmt.Errorf("PDF text extraction requires 'pdftotext' — install poppler-utils") } out, err := exec.Command(pdftotextPath, path, "-").CombinedOutput() if err != nil { return "", fmt.Errorf("pdftotext failed: %w\n%s", err, string(out)) } return string(out), nil } // textToPDF creates a basic PDF from plain text. func textToPDF(text, outputPath string) error { lines := strings.Split(text, "\n") var content strings.Builder content.WriteString("%PDF-1.4\n") content.WriteString("1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n") content.WriteString("2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n") var stream strings.Builder stream.WriteString("BT\n/F1 10 Tf\n") y := 780.0 for _, line := range lines { if y < 40 { break } safe := strings.ReplaceAll(line, "\\", "\\\\") safe = strings.ReplaceAll(safe, "(", "\\(") safe = strings.ReplaceAll(safe, ")", "\\)") stream.WriteString(fmt.Sprintf("1 0 0 1 40 %.0f Tm\n(%s) Tj\n", y, safe)) y -= 14 } stream.WriteString("ET\n") streamBytes := stream.String() content.WriteString(fmt.Sprintf("3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >>\nendobj\n")) content.WriteString(fmt.Sprintf("4 0 obj\n<< /Length %d >>\nstream\n%sendstream\nendobj\n", len(streamBytes), streamBytes)) content.WriteString("5 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\nendobj\n") content.WriteString("xref\n0 6\n") content.WriteString("trailer\n<< /Size 6 /Root 1 0 R >>\nstartxref\n0\n%%EOF\n") return os.WriteFile(outputPath, []byte(content.String()), 0o644) } func htmlToPDF(html, outputPath string) error { text := stripHTMLTags(html) return textToPDF(text, outputPath) }