import { parseAsync } from "docx-preview";
import JSZip from "jszip";
import * as pdfjsLib from "pdfjs-dist";
import * as XLSX from "xlsx";
import { i18n } from "./i18n.js";

// Configure PDF.js worker - we'll need to bundle this
pdfjsLib.GlobalWorkerOptions.workerSrc = new URL(
  "pdfjs-dist/build/pdf.worker.min.mjs",
  import.meta.url,
).toString();

/**
 * Load an attachment from various sources
 * @param source - URL string, File, Blob, or ArrayBuffer
 * @param fileName - Optional filename override
 * @returns Promise
 * @throws Error if loading fails
 */
export async function loadAttachment(source, fileName) {
  let arrayBuffer;
  let detectedFileName = fileName || "unnamed";
  let mimeType = "application/octet-stream";
  let size = 0;

  // Convert source to ArrayBuffer
  if (typeof source === "string") {
    // It's a URL - fetch it
    const response = await fetch(source);
    if (!response.ok) {
      throw new Error(i18n("Failed to fetch file"));
    }
    arrayBuffer = await response.arrayBuffer();
    size = arrayBuffer.byteLength;
    mimeType = response.headers.get("content-type") || mimeType;
    if (!fileName) {
      // Try to extract filename from URL
      const urlParts = source.split("/");
      detectedFileName = urlParts[urlParts.length - 1] || "document";
    }
  } else if (source instanceof File) {
    arrayBuffer = await source.arrayBuffer();
    size = source.size;
    mimeType = source.type || mimeType;
    detectedFileName = fileName || source.name;
  } else if (source instanceof Blob) {
    arrayBuffer = await source.arrayBuffer();
    size = source.size;
    mimeType = source.type || mimeType;
  } else if (source instanceof ArrayBuffer) {
    arrayBuffer = source;
    size = source.byteLength;
  } else {
    throw new Error(i18n("Invalid source type"));
  }

  // Convert ArrayBuffer to base64 - handle large files properly
  const uint8Array = new Uint8Array(arrayBuffer);
  let binary = "";
  const chunkSize = 0x8000; // Process in 32KB chunks to avoid stack overflow
  for (let i = 0; i < uint8Array.length; i += chunkSize) {
    const chunk = uint8Array.slice(i, i + chunkSize);
    binary += String.fromCharCode(...chunk);
  }
  const base64Content = btoa(binary);

  // Detect type and process accordingly
  const id = `${detectedFileName}_${Date.now()}_${Math.random()}`;

  // Check if it's a PDF
  if (mimeType === "application/pdf" || detectedFileName.toLowerCase().endsWith(".pdf")) {
    const { extractedText, preview } = await processPdf(arrayBuffer, detectedFileName);
    return {
      id,
      type: "document",
      fileName: detectedFileName,
      mimeType: "application/pdf",
      size,
      content: base64Content,
      extractedText,
      preview,
    };
  }

  // Check if it's a DOCX file
  if (
    mimeType === "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ||
    detectedFileName.toLowerCase().endsWith(".docx")
  ) {
    const { extractedText } = await processDocx(arrayBuffer, detectedFileName);
    return {
      id,
      type: "document",
      fileName: detectedFileName,
      mimeType: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
      size,
      content: base64Content,
      extractedText,
    };
  }

  // Check if it's a PPTX file
  if (
    mimeType === "application/vnd.openxmlformats-officedocument.presentationml.presentation" ||
    detectedFileName.toLowerCase().endsWith(".pptx")
  ) {
    const { extractedText } = await processPptx(arrayBuffer, detectedFileName);
    return {
      id,
      type: "document",
      fileName: detectedFileName,
      mimeType: "application/vnd.openxmlformats-officedocument.presentationml.presentation",
      size,
      content: base64Content,
      extractedText,
    };
  }

  // Check if it's an Excel file (XLSX/XLS)
  const excelMimeTypes = [
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.ms-excel",
  ];
  if (
    excelMimeTypes.includes(mimeType) ||
    detectedFileName.toLowerCase().endsWith(".xlsx") ||
    detectedFileName.toLowerCase().endsWith(".xls")
  ) {
    const { extractedText } = await processExcel(arrayBuffer, detectedFileName);
    return {
      id,
      type: "document",
      fileName: detectedFileName,
      mimeType: mimeType.startsWith("application/vnd")
        ? mimeType
        : "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
      size,
      content: base64Content,
      extractedText,
    };
  }

  // Check if it's an image
  if (mimeType.startsWith("image/")) {
    return {
      id,
      type: "image",
      fileName: detectedFileName,
      mimeType,
      size,
      content: base64Content,
      preview: base64Content, // For images, preview is the same as content
    };
  }

  // Check if it's a text document
  const textExtensions = [
    ".txt",
    ".md",
    ".json",
    ".xml",
    ".html",
    ".css",
    ".js",
    ".ts",
    ".jsx",
    ".tsx",
    ".yml",
    ".yaml",
  ];
  const isTextFile =
    mimeType.startsWith("text/") ||
    textExtensions.some((ext) => detectedFileName.toLowerCase().endsWith(ext));
  if (isTextFile) {
    const decoder = new TextDecoder();
    const text = decoder.decode(arrayBuffer);
    return {
      id,
      type: "document",
      fileName: detectedFileName,
      mimeType: mimeType.startsWith("text/") ? mimeType : "text/plain",
      size,
      content: base64Content,
      extractedText: text,
    };
  }

  throw new Error(`Unsupported file type: ${mimeType}`);
}
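
// Usage sketch (illustrative only, not part of this module): feeding a user-selected
// file through loadAttachment. The "#attachment-input" selector and the console
// logging are assumptions made for the example.
//
//   const input = document.querySelector("#attachment-input");
//   input?.addEventListener("change", async () => {
//     const file = input.files?.[0];
//     if (!file) return;
//     const attachment = await loadAttachment(file);
//     // The result includes base64 `content`, plus `extractedText` for documents
//     // and a base64 `preview` image for PDFs and images.
//     console.log(attachment.fileName, attachment.mimeType, attachment.size);
//   });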
"application/vnd.ms-excel", ]; if (excelMimeTypes.includes(mimeType) || detectedFileName.toLowerCase().endsWith(".xlsx") || detectedFileName.toLowerCase().endsWith(".xls")) { const { extractedText } = await processExcel(arrayBuffer, detectedFileName); return { id, type: "document", fileName: detectedFileName, mimeType: mimeType.startsWith("application/vnd") ? mimeType : "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", size, content: base64Content, extractedText, }; } // Check if it's an image if (mimeType.startsWith("image/")) { return { id, type: "image", fileName: detectedFileName, mimeType, size, content: base64Content, preview: base64Content, // For images, preview is the same as content }; } // Check if it's a text document const textExtensions = [ ".txt", ".md", ".json", ".xml", ".html", ".css", ".js", ".ts", ".jsx", ".tsx", ".yml", ".yaml", ]; const isTextFile = mimeType.startsWith("text/") || textExtensions.some((ext) => detectedFileName.toLowerCase().endsWith(ext)); if (isTextFile) { const decoder = new TextDecoder(); const text = decoder.decode(arrayBuffer); return { id, type: "document", fileName: detectedFileName, mimeType: mimeType.startsWith("text/") ? mimeType : "text/plain", size, content: base64Content, extractedText: text, }; } throw new Error(`Unsupported file type: ${mimeType}`); } async function processPdf(arrayBuffer, fileName) { let pdf = null; try { pdf = await pdfjsLib.getDocument({ data: arrayBuffer }).promise; // Extract text with page structure let extractedText = ``; for (let i = 1; i <= pdf.numPages; i++) { const page = await pdf.getPage(i); const textContent = await page.getTextContent(); const pageText = textContent.items .map((item) => item.str) .filter((str) => str.trim()) .join(" "); extractedText += `\n\n${pageText}\n`; } extractedText += "\n"; // Generate preview from first page const preview = await generatePdfPreview(pdf); return { extractedText, preview }; } catch (error) { console.error("Error processing PDF:", error); throw new Error(`Failed to process PDF: ${String(error)}`); } finally { // Clean up PDF resources if (pdf) { pdf.destroy(); } } } async function generatePdfPreview(pdf) { try { const page = await pdf.getPage(1); const viewport = page.getViewport({ scale: 1.0 }); // Create canvas with reasonable size for thumbnail (160x160 max) const scale = Math.min(160 / viewport.width, 160 / viewport.height); const scaledViewport = page.getViewport({ scale }); const canvas = document.createElement("canvas"); const context = canvas.getContext("2d"); if (!context) { return undefined; } canvas.height = scaledViewport.height; canvas.width = scaledViewport.width; const renderContext = { canvasContext: context, viewport: scaledViewport, canvas: canvas, }; await page.render(renderContext).promise; // Return base64 without data URL prefix return canvas.toDataURL("image/png").split(",")[1]; } catch (error) { console.error("Error generating PDF preview:", error); return undefined; } } async function processDocx(arrayBuffer, fileName) { try { // Parse document structure const wordDoc = await parseAsync(arrayBuffer); // Extract structured text from document body let extractedText = `\n\n`; const body = wordDoc.documentPart?.body; if (body?.children) { // Walk through document elements and extract text const texts = []; for (const element of body.children) { const text = extractTextFromElement(element); if (text) { texts.push(text); } } extractedText += texts.join("\n"); } extractedText += `\n\n`; return { extractedText }; } catch (error) { 
console.error("Error processing DOCX:", error); throw new Error(`Failed to process DOCX: ${String(error)}`); } } function extractTextFromElement(element) { let text = ""; // Check type with lowercase const elementType = element.type?.toLowerCase() || ""; // Handle paragraphs if (elementType === "paragraph" && element.children) { for (const child of element.children) { const childType = child.type?.toLowerCase() || ""; if (childType === "run" && child.children) { for (const textChild of child.children) { const textType = textChild.type?.toLowerCase() || ""; if (textType === "text") { text += textChild.text || ""; } } } else if (childType === "text") { text += child.text || ""; } } } // Handle tables else if (elementType === "table") { if (element.children) { const tableTexts = []; for (const row of element.children) { const rowType = row.type?.toLowerCase() || ""; if (rowType === "tablerow" && row.children) { const rowTexts = []; for (const cell of row.children) { const cellType = cell.type?.toLowerCase() || ""; if (cellType === "tablecell" && cell.children) { const cellTexts = []; for (const cellElement of cell.children) { const cellText = extractTextFromElement(cellElement); if (cellText) cellTexts.push(cellText); } if (cellTexts.length > 0) rowTexts.push(cellTexts.join(" ")); } } if (rowTexts.length > 0) tableTexts.push(rowTexts.join(" | ")); } } if (tableTexts.length > 0) { text = "\n[Table]\n" + tableTexts.join("\n") + "\n[/Table]\n"; } } } // Recursively handle other container elements else if (element.children && Array.isArray(element.children)) { const childTexts = []; for (const child of element.children) { const childText = extractTextFromElement(child); if (childText) childTexts.push(childText); } text = childTexts.join(" "); } return text.trim(); } async function processPptx(arrayBuffer, fileName) { try { // Load the PPTX file as a ZIP const zip = await JSZip.loadAsync(arrayBuffer); // PPTX slides are stored in ppt/slides/slide[n].xml let extractedText = ``; // Get all slide files and sort them numerically const slideFiles = Object.keys(zip.files) .filter((name) => name.match(/ppt\/slides\/slide\d+\.xml$/)) .sort((a, b) => { const numA = Number.parseInt(a.match(/slide(\d+)\.xml$/)?.[1] || "0", 10); const numB = Number.parseInt(b.match(/slide(\d+)\.xml$/)?.[1] || "0", 10); return numA - numB; }); // Extract text from each slide for (let i = 0; i < slideFiles.length; i++) { const slideFile = zip.file(slideFiles[i]); if (slideFile) { const slideXml = await slideFile.async("text"); // Extract text from XML (simple regex approach) // Looking for tags which contain text in PPTX const textMatches = slideXml.match(/]*>([^<]+)<\/a:t>/g); if (textMatches) { extractedText += `\n`; const slideTexts = textMatches .map((match) => { const textMatch = match.match(/]*>([^<]+)<\/a:t>/); return textMatch ? 
textMatch[1] : ""; }) .filter((t) => t.trim()); if (slideTexts.length > 0) { extractedText += "\n" + slideTexts.join("\n"); } extractedText += "\n"; } } } // Also try to extract text from notes const notesFiles = Object.keys(zip.files) .filter((name) => name.match(/ppt\/notesSlides\/notesSlide\d+\.xml$/)) .sort((a, b) => { const numA = Number.parseInt(a.match(/notesSlide(\d+)\.xml$/)?.[1] || "0", 10); const numB = Number.parseInt(b.match(/notesSlide(\d+)\.xml$/)?.[1] || "0", 10); return numA - numB; }); if (notesFiles.length > 0) { extractedText += "\n"; for (const noteFile of notesFiles) { const file = zip.file(noteFile); if (file) { const noteXml = await file.async("text"); const textMatches = noteXml.match(/]*>([^<]+)<\/a:t>/g); if (textMatches) { const noteTexts = textMatches .map((match) => { const textMatch = match.match(/]*>([^<]+)<\/a:t>/); return textMatch ? textMatch[1] : ""; }) .filter((t) => t.trim()); if (noteTexts.length > 0) { const slideNum = noteFile.match(/notesSlide(\d+)\.xml$/)?.[1]; extractedText += `\n[Slide ${slideNum} notes]: ${noteTexts.join(" ")}`; } } } } extractedText += "\n"; } extractedText += "\n"; return { extractedText }; } catch (error) { console.error("Error processing PPTX:", error); throw new Error(`Failed to process PPTX: ${String(error)}`); } } async function processExcel(arrayBuffer, fileName) { try { // Read the workbook const workbook = XLSX.read(arrayBuffer, { type: "array" }); let extractedText = ``; // Process each sheet for (const [index, sheetName] of workbook.SheetNames.entries()) { const worksheet = workbook.Sheets[sheetName]; // Extract text as CSV for the extractedText field const csvText = XLSX.utils.sheet_to_csv(worksheet); extractedText += `\n\n${csvText}\n`; } extractedText += "\n"; return { extractedText }; } catch (error) { console.error("Error processing Excel:", error); throw new Error(`Failed to process Excel: ${String(error)}`); } } //# sourceMappingURL=attachment-utils.js.map