415 lines
16 KiB
JavaScript
415 lines
16 KiB
JavaScript
import { parseAsync } from "docx-preview";
|
|
import JSZip from "jszip";
|
|
import * as pdfjsLib from "pdfjs-dist";
|
|
import * as XLSX from "xlsx";
|
|
import { i18n } from "./i18n.js";
|
|
// Configure PDF.js worker - we'll need to bundle this
|
|
pdfjsLib.GlobalWorkerOptions.workerSrc = new URL("pdfjs-dist/build/pdf.worker.min.mjs", import.meta.url).toString();
|
|
/**
|
|
* Load an attachment from various sources
|
|
* @param source - URL string, File, Blob, or ArrayBuffer
|
|
* @param fileName - Optional filename override
|
|
* @returns Promise<Attachment>
|
|
* @throws Error if loading fails
|
|
*/
|
|
export async function loadAttachment(source, fileName) {
|
|
let arrayBuffer;
|
|
let detectedFileName = fileName || "unnamed";
|
|
let mimeType = "application/octet-stream";
|
|
let size = 0;
|
|
// Convert source to ArrayBuffer
|
|
if (typeof source === "string") {
|
|
// It's a URL - fetch it
|
|
const response = await fetch(source);
|
|
if (!response.ok) {
|
|
throw new Error(i18n("Failed to fetch file"));
|
|
}
|
|
arrayBuffer = await response.arrayBuffer();
|
|
size = arrayBuffer.byteLength;
|
|
mimeType = response.headers.get("content-type") || mimeType;
|
|
if (!fileName) {
|
|
// Try to extract filename from URL
|
|
const urlParts = source.split("/");
|
|
detectedFileName = urlParts[urlParts.length - 1] || "document";
|
|
}
|
|
}
|
|
else if (source instanceof File) {
|
|
arrayBuffer = await source.arrayBuffer();
|
|
size = source.size;
|
|
mimeType = source.type || mimeType;
|
|
detectedFileName = fileName || source.name;
|
|
}
|
|
else if (source instanceof Blob) {
|
|
arrayBuffer = await source.arrayBuffer();
|
|
size = source.size;
|
|
mimeType = source.type || mimeType;
|
|
}
|
|
else if (source instanceof ArrayBuffer) {
|
|
arrayBuffer = source;
|
|
size = source.byteLength;
|
|
}
|
|
else {
|
|
throw new Error(i18n("Invalid source type"));
|
|
}
|
|
// Convert ArrayBuffer to base64 - handle large files properly
|
|
const uint8Array = new Uint8Array(arrayBuffer);
|
|
let binary = "";
|
|
const chunkSize = 0x8000; // Process in 32KB chunks to avoid stack overflow
|
|
for (let i = 0; i < uint8Array.length; i += chunkSize) {
|
|
const chunk = uint8Array.slice(i, i + chunkSize);
|
|
binary += String.fromCharCode(...chunk);
|
|
}
|
|
const base64Content = btoa(binary);
|
|
// Detect type and process accordingly
|
|
const id = `${detectedFileName}_${Date.now()}_${Math.random()}`;
|
|
// Check if it's a PDF
|
|
if (mimeType === "application/pdf" || detectedFileName.toLowerCase().endsWith(".pdf")) {
|
|
const { extractedText, preview } = await processPdf(arrayBuffer, detectedFileName);
|
|
return {
|
|
id,
|
|
type: "document",
|
|
fileName: detectedFileName,
|
|
mimeType: "application/pdf",
|
|
size,
|
|
content: base64Content,
|
|
extractedText,
|
|
preview,
|
|
};
|
|
}
|
|
// Check if it's a DOCX file
|
|
if (mimeType === "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ||
|
|
detectedFileName.toLowerCase().endsWith(".docx")) {
|
|
const { extractedText } = await processDocx(arrayBuffer, detectedFileName);
|
|
return {
|
|
id,
|
|
type: "document",
|
|
fileName: detectedFileName,
|
|
mimeType: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
size,
|
|
content: base64Content,
|
|
extractedText,
|
|
};
|
|
}
|
|
// Check if it's a PPTX file
|
|
if (mimeType === "application/vnd.openxmlformats-officedocument.presentationml.presentation" ||
|
|
detectedFileName.toLowerCase().endsWith(".pptx")) {
|
|
const { extractedText } = await processPptx(arrayBuffer, detectedFileName);
|
|
return {
|
|
id,
|
|
type: "document",
|
|
fileName: detectedFileName,
|
|
mimeType: "application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
size,
|
|
content: base64Content,
|
|
extractedText,
|
|
};
|
|
}
|
|
// Check if it's an Excel file (XLSX/XLS)
|
|
const excelMimeTypes = [
|
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
"application/vnd.ms-excel",
|
|
];
|
|
if (excelMimeTypes.includes(mimeType) ||
|
|
detectedFileName.toLowerCase().endsWith(".xlsx") ||
|
|
detectedFileName.toLowerCase().endsWith(".xls")) {
|
|
const { extractedText } = await processExcel(arrayBuffer, detectedFileName);
|
|
return {
|
|
id,
|
|
type: "document",
|
|
fileName: detectedFileName,
|
|
mimeType: mimeType.startsWith("application/vnd")
|
|
? mimeType
|
|
: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
size,
|
|
content: base64Content,
|
|
extractedText,
|
|
};
|
|
}
|
|
// Check if it's an image
|
|
if (mimeType.startsWith("image/")) {
|
|
return {
|
|
id,
|
|
type: "image",
|
|
fileName: detectedFileName,
|
|
mimeType,
|
|
size,
|
|
content: base64Content,
|
|
preview: base64Content, // For images, preview is the same as content
|
|
};
|
|
}
|
|
// Check if it's a text document
|
|
const textExtensions = [
|
|
".txt",
|
|
".md",
|
|
".json",
|
|
".xml",
|
|
".html",
|
|
".css",
|
|
".js",
|
|
".ts",
|
|
".jsx",
|
|
".tsx",
|
|
".yml",
|
|
".yaml",
|
|
];
|
|
const isTextFile = mimeType.startsWith("text/") || textExtensions.some((ext) => detectedFileName.toLowerCase().endsWith(ext));
|
|
if (isTextFile) {
|
|
const decoder = new TextDecoder();
|
|
const text = decoder.decode(arrayBuffer);
|
|
return {
|
|
id,
|
|
type: "document",
|
|
fileName: detectedFileName,
|
|
mimeType: mimeType.startsWith("text/") ? mimeType : "text/plain",
|
|
size,
|
|
content: base64Content,
|
|
extractedText: text,
|
|
};
|
|
}
|
|
throw new Error(`Unsupported file type: ${mimeType}`);
|
|
}
|
|
async function processPdf(arrayBuffer, fileName) {
|
|
let pdf = null;
|
|
try {
|
|
pdf = await pdfjsLib.getDocument({ data: arrayBuffer }).promise;
|
|
// Extract text with page structure
|
|
let extractedText = `<pdf filename="${fileName}">`;
|
|
for (let i = 1; i <= pdf.numPages; i++) {
|
|
const page = await pdf.getPage(i);
|
|
const textContent = await page.getTextContent();
|
|
const pageText = textContent.items
|
|
.map((item) => item.str)
|
|
.filter((str) => str.trim())
|
|
.join(" ");
|
|
extractedText += `\n<page number="${i}">\n${pageText}\n</page>`;
|
|
}
|
|
extractedText += "\n</pdf>";
|
|
// Generate preview from first page
|
|
const preview = await generatePdfPreview(pdf);
|
|
return { extractedText, preview };
|
|
}
|
|
catch (error) {
|
|
console.error("Error processing PDF:", error);
|
|
throw new Error(`Failed to process PDF: ${String(error)}`);
|
|
}
|
|
finally {
|
|
// Clean up PDF resources
|
|
if (pdf) {
|
|
pdf.destroy();
|
|
}
|
|
}
|
|
}
|
|
async function generatePdfPreview(pdf) {
|
|
try {
|
|
const page = await pdf.getPage(1);
|
|
const viewport = page.getViewport({ scale: 1.0 });
|
|
// Create canvas with reasonable size for thumbnail (160x160 max)
|
|
const scale = Math.min(160 / viewport.width, 160 / viewport.height);
|
|
const scaledViewport = page.getViewport({ scale });
|
|
const canvas = document.createElement("canvas");
|
|
const context = canvas.getContext("2d");
|
|
if (!context) {
|
|
return undefined;
|
|
}
|
|
canvas.height = scaledViewport.height;
|
|
canvas.width = scaledViewport.width;
|
|
const renderContext = {
|
|
canvasContext: context,
|
|
viewport: scaledViewport,
|
|
canvas: canvas,
|
|
};
|
|
await page.render(renderContext).promise;
|
|
// Return base64 without data URL prefix
|
|
return canvas.toDataURL("image/png").split(",")[1];
|
|
}
|
|
catch (error) {
|
|
console.error("Error generating PDF preview:", error);
|
|
return undefined;
|
|
}
|
|
}
|
|
async function processDocx(arrayBuffer, fileName) {
|
|
try {
|
|
// Parse document structure
|
|
const wordDoc = await parseAsync(arrayBuffer);
|
|
// Extract structured text from document body
|
|
let extractedText = `<docx filename="${fileName}">\n<page number="1">\n`;
|
|
const body = wordDoc.documentPart?.body;
|
|
if (body?.children) {
|
|
// Walk through document elements and extract text
|
|
const texts = [];
|
|
for (const element of body.children) {
|
|
const text = extractTextFromElement(element);
|
|
if (text) {
|
|
texts.push(text);
|
|
}
|
|
}
|
|
extractedText += texts.join("\n");
|
|
}
|
|
extractedText += `\n</page>\n</docx>`;
|
|
return { extractedText };
|
|
}
|
|
catch (error) {
|
|
console.error("Error processing DOCX:", error);
|
|
throw new Error(`Failed to process DOCX: ${String(error)}`);
|
|
}
|
|
}
|
|
function extractTextFromElement(element) {
|
|
let text = "";
|
|
// Check type with lowercase
|
|
const elementType = element.type?.toLowerCase() || "";
|
|
// Handle paragraphs
|
|
if (elementType === "paragraph" && element.children) {
|
|
for (const child of element.children) {
|
|
const childType = child.type?.toLowerCase() || "";
|
|
if (childType === "run" && child.children) {
|
|
for (const textChild of child.children) {
|
|
const textType = textChild.type?.toLowerCase() || "";
|
|
if (textType === "text") {
|
|
text += textChild.text || "";
|
|
}
|
|
}
|
|
}
|
|
else if (childType === "text") {
|
|
text += child.text || "";
|
|
}
|
|
}
|
|
}
|
|
// Handle tables
|
|
else if (elementType === "table") {
|
|
if (element.children) {
|
|
const tableTexts = [];
|
|
for (const row of element.children) {
|
|
const rowType = row.type?.toLowerCase() || "";
|
|
if (rowType === "tablerow" && row.children) {
|
|
const rowTexts = [];
|
|
for (const cell of row.children) {
|
|
const cellType = cell.type?.toLowerCase() || "";
|
|
if (cellType === "tablecell" && cell.children) {
|
|
const cellTexts = [];
|
|
for (const cellElement of cell.children) {
|
|
const cellText = extractTextFromElement(cellElement);
|
|
if (cellText)
|
|
cellTexts.push(cellText);
|
|
}
|
|
if (cellTexts.length > 0)
|
|
rowTexts.push(cellTexts.join(" "));
|
|
}
|
|
}
|
|
if (rowTexts.length > 0)
|
|
tableTexts.push(rowTexts.join(" | "));
|
|
}
|
|
}
|
|
if (tableTexts.length > 0) {
|
|
text = "\n[Table]\n" + tableTexts.join("\n") + "\n[/Table]\n";
|
|
}
|
|
}
|
|
}
|
|
// Recursively handle other container elements
|
|
else if (element.children && Array.isArray(element.children)) {
|
|
const childTexts = [];
|
|
for (const child of element.children) {
|
|
const childText = extractTextFromElement(child);
|
|
if (childText)
|
|
childTexts.push(childText);
|
|
}
|
|
text = childTexts.join(" ");
|
|
}
|
|
return text.trim();
|
|
}
|
|
async function processPptx(arrayBuffer, fileName) {
|
|
try {
|
|
// Load the PPTX file as a ZIP
|
|
const zip = await JSZip.loadAsync(arrayBuffer);
|
|
// PPTX slides are stored in ppt/slides/slide[n].xml
|
|
let extractedText = `<pptx filename="${fileName}">`;
|
|
// Get all slide files and sort them numerically
|
|
const slideFiles = Object.keys(zip.files)
|
|
.filter((name) => name.match(/ppt\/slides\/slide\d+\.xml$/))
|
|
.sort((a, b) => {
|
|
const numA = Number.parseInt(a.match(/slide(\d+)\.xml$/)?.[1] || "0", 10);
|
|
const numB = Number.parseInt(b.match(/slide(\d+)\.xml$/)?.[1] || "0", 10);
|
|
return numA - numB;
|
|
});
|
|
// Extract text from each slide
|
|
for (let i = 0; i < slideFiles.length; i++) {
|
|
const slideFile = zip.file(slideFiles[i]);
|
|
if (slideFile) {
|
|
const slideXml = await slideFile.async("text");
|
|
// Extract text from XML (simple regex approach)
|
|
// Looking for <a:t> tags which contain text in PPTX
|
|
const textMatches = slideXml.match(/<a:t[^>]*>([^<]+)<\/a:t>/g);
|
|
if (textMatches) {
|
|
extractedText += `\n<slide number="${i + 1}">`;
|
|
const slideTexts = textMatches
|
|
.map((match) => {
|
|
const textMatch = match.match(/<a:t[^>]*>([^<]+)<\/a:t>/);
|
|
return textMatch ? textMatch[1] : "";
|
|
})
|
|
.filter((t) => t.trim());
|
|
if (slideTexts.length > 0) {
|
|
extractedText += "\n" + slideTexts.join("\n");
|
|
}
|
|
extractedText += "\n</slide>";
|
|
}
|
|
}
|
|
}
|
|
// Also try to extract text from notes
|
|
const notesFiles = Object.keys(zip.files)
|
|
.filter((name) => name.match(/ppt\/notesSlides\/notesSlide\d+\.xml$/))
|
|
.sort((a, b) => {
|
|
const numA = Number.parseInt(a.match(/notesSlide(\d+)\.xml$/)?.[1] || "0", 10);
|
|
const numB = Number.parseInt(b.match(/notesSlide(\d+)\.xml$/)?.[1] || "0", 10);
|
|
return numA - numB;
|
|
});
|
|
if (notesFiles.length > 0) {
|
|
extractedText += "\n<notes>";
|
|
for (const noteFile of notesFiles) {
|
|
const file = zip.file(noteFile);
|
|
if (file) {
|
|
const noteXml = await file.async("text");
|
|
const textMatches = noteXml.match(/<a:t[^>]*>([^<]+)<\/a:t>/g);
|
|
if (textMatches) {
|
|
const noteTexts = textMatches
|
|
.map((match) => {
|
|
const textMatch = match.match(/<a:t[^>]*>([^<]+)<\/a:t>/);
|
|
return textMatch ? textMatch[1] : "";
|
|
})
|
|
.filter((t) => t.trim());
|
|
if (noteTexts.length > 0) {
|
|
const slideNum = noteFile.match(/notesSlide(\d+)\.xml$/)?.[1];
|
|
extractedText += `\n[Slide ${slideNum} notes]: ${noteTexts.join(" ")}`;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
extractedText += "\n</notes>";
|
|
}
|
|
extractedText += "\n</pptx>";
|
|
return { extractedText };
|
|
}
|
|
catch (error) {
|
|
console.error("Error processing PPTX:", error);
|
|
throw new Error(`Failed to process PPTX: ${String(error)}`);
|
|
}
|
|
}
|
|
async function processExcel(arrayBuffer, fileName) {
|
|
try {
|
|
// Read the workbook
|
|
const workbook = XLSX.read(arrayBuffer, { type: "array" });
|
|
let extractedText = `<excel filename="${fileName}">`;
|
|
// Process each sheet
|
|
for (const [index, sheetName] of workbook.SheetNames.entries()) {
|
|
const worksheet = workbook.Sheets[sheetName];
|
|
// Extract text as CSV for the extractedText field
|
|
const csvText = XLSX.utils.sheet_to_csv(worksheet);
|
|
extractedText += `\n<sheet name="${sheetName}" index="${index + 1}">\n${csvText}\n</sheet>`;
|
|
}
|
|
extractedText += "\n</excel>";
|
|
return { extractedText };
|
|
}
|
|
catch (error) {
|
|
console.error("Error processing Excel:", error);
|
|
throw new Error(`Failed to process Excel: ${String(error)}`);
|
|
}
|
|
}
|
|
//# sourceMappingURL=attachment-utils.js.map
|