Files
clawdbot/apps/macos/Sources/Clawdis/Resources/WebChat/utils/attachment-utils.js
2025-12-06 05:01:28 +01:00

415 lines
16 KiB
JavaScript

import { parseAsync } from "docx-preview";
import JSZip from "jszip";
import * as pdfjsLib from "pdfjs-dist";
import * as XLSX from "xlsx";
import { i18n } from "./i18n.js";
// Configure the PDF.js worker. The worker script URL is resolved relative to this
// module's own URL (import.meta.url), so the worker file still has to be bundled
// and served at that location for PDF processing to work.
pdfjsLib.GlobalWorkerOptions.workerSrc = new URL("pdfjs-dist/build/pdf.worker.min.mjs", import.meta.url).toString();
/**
 * Load an attachment from a URL, File, Blob, or ArrayBuffer and convert it into
 * an attachment record: base64 content plus, depending on the detected type,
 * extracted text and/or a preview image.
 *
 * Type detection checks the MIME type first and the file-name extension second.
 * The MIME type is normalized before matching (parameters such as
 * "; charset=utf-8" are dropped and the value is lower-cased), and the returned
 * `mimeType` is that normalized value. For URL sources without an explicit
 * `fileName`, the name is the last path segment with query string and fragment
 * removed.
 *
 * @param {string|File|Blob|ArrayBuffer} source - URL string, File, Blob, or ArrayBuffer
 * @param {string} [fileName] - Optional filename override
 * @returns {Promise<object>} Attachment with `id`, `type` ("document" | "image"),
 *   `fileName`, `mimeType`, `size`, `content` (base64) and, where applicable,
 *   `extractedText` / `preview`.
 * @throws {Error} If the fetch fails, the source type is invalid, the file type
 *   is unsupported, or a format-specific processor fails.
 */
export async function loadAttachment(source, fileName) {
    const DEFAULT_MIME_TYPE = "application/octet-stream";
    const PDF_MIME = "application/pdf";
    const DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    const PPTX_MIME = "application/vnd.openxmlformats-officedocument.presentationml.presentation";
    const XLSX_MIME = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
    const EXCEL_MIMES = [XLSX_MIME, "application/vnd.ms-excel"];
    const TEXT_EXTENSIONS = [
        ".txt", ".md", ".json", ".xml", ".html", ".css",
        ".js", ".ts", ".jsx", ".tsx", ".yml", ".yaml",
    ];

    let arrayBuffer;
    let detectedFileName = fileName || "unnamed";
    let declaredMimeType = "";
    let size = 0;

    // Convert source to ArrayBuffer
    if (typeof source === "string") {
        // It's a URL - fetch it
        const response = await fetch(source);
        if (!response.ok) {
            throw new Error(i18n("Failed to fetch file"));
        }
        arrayBuffer = await response.arrayBuffer();
        size = arrayBuffer.byteLength;
        declaredMimeType = response.headers.get("content-type") || "";
        if (!fileName) {
            detectedFileName = fileNameFromUrl(source);
        }
    }
    else if (source instanceof File) {
        arrayBuffer = await source.arrayBuffer();
        size = source.size;
        declaredMimeType = source.type;
        detectedFileName = fileName || source.name;
    }
    else if (source instanceof Blob) {
        arrayBuffer = await source.arrayBuffer();
        size = source.size;
        declaredMimeType = source.type;
    }
    else if (source instanceof ArrayBuffer) {
        arrayBuffer = source;
        size = source.byteLength;
    }
    else {
        throw new Error(i18n("Invalid source type"));
    }

    const mimeType = normalizeMimeType(declaredMimeType) || DEFAULT_MIME_TYPE;
    const lowerName = detectedFileName.toLowerCase();

    // Encode before handing the buffer to any processor, so the encoded content
    // never depends on what a processor does with the buffer afterwards.
    const base64Content = arrayBufferToBase64(arrayBuffer);
    const id = `${detectedFileName}_${Date.now()}_${Math.random()}`;

    // PDF
    if (mimeType === PDF_MIME || lowerName.endsWith(".pdf")) {
        const { extractedText, preview } = await processPdf(arrayBuffer, detectedFileName);
        return {
            id,
            type: "document",
            fileName: detectedFileName,
            mimeType: PDF_MIME,
            size,
            content: base64Content,
            extractedText,
            preview,
        };
    }
    // DOCX
    if (mimeType === DOCX_MIME || lowerName.endsWith(".docx")) {
        const { extractedText } = await processDocx(arrayBuffer, detectedFileName);
        return {
            id,
            type: "document",
            fileName: detectedFileName,
            mimeType: DOCX_MIME,
            size,
            content: base64Content,
            extractedText,
        };
    }
    // PPTX
    if (mimeType === PPTX_MIME || lowerName.endsWith(".pptx")) {
        const { extractedText } = await processPptx(arrayBuffer, detectedFileName);
        return {
            id,
            type: "document",
            fileName: detectedFileName,
            mimeType: PPTX_MIME,
            size,
            content: base64Content,
            extractedText,
        };
    }
    // Excel (XLSX/XLS)
    if (EXCEL_MIMES.includes(mimeType) || lowerName.endsWith(".xlsx") || lowerName.endsWith(".xls")) {
        const { extractedText } = await processExcel(arrayBuffer, detectedFileName);
        return {
            id,
            type: "document",
            fileName: detectedFileName,
            // Keep a vendor MIME type if one was supplied; otherwise report XLSX.
            mimeType: mimeType.startsWith("application/vnd") ? mimeType : XLSX_MIME,
            size,
            content: base64Content,
            extractedText,
        };
    }
    // Image
    if (mimeType.startsWith("image/")) {
        return {
            id,
            type: "image",
            fileName: detectedFileName,
            mimeType,
            size,
            content: base64Content,
            preview: base64Content, // For images, preview is the same as content
        };
    }
    // Plain-text document
    const isTextFile = mimeType.startsWith("text/") || TEXT_EXTENSIONS.some((ext) => lowerName.endsWith(ext));
    if (isTextFile) {
        return {
            id,
            type: "document",
            fileName: detectedFileName,
            mimeType: mimeType.startsWith("text/") ? mimeType : "text/plain",
            size,
            content: base64Content,
            extractedText: new TextDecoder().decode(arrayBuffer),
        };
    }
    throw new Error(`Unsupported file type: ${mimeType}`);
}
/**
 * Reduce a MIME type / Content-Type value to its bare media type:
 * parameters after ";" are dropped, whitespace trimmed, result lower-cased.
 * Returns "" for empty or missing input.
 */
function normalizeMimeType(value) {
    if (!value) {
        return "";
    }
    return String(value).split(";", 1)[0].trim().toLowerCase();
}
/**
 * Derive a file name from a URL string: the last path segment, without query
 * string or fragment, percent-decoded when possible. Falls back to "document"
 * when there is no usable segment or the URL is a data: URL (whose "path" is
 * the payload itself).
 */
function fileNameFromUrl(url) {
    if (/^data:/i.test(url)) {
        return "document";
    }
    const pathOnly = url.split(/[?#]/, 1)[0];
    const lastSegment = pathOnly.slice(pathOnly.lastIndexOf("/") + 1);
    if (!lastSegment) {
        return "document";
    }
    try {
        return decodeURIComponent(lastSegment) || "document";
    }
    catch {
        // Malformed percent-escape: keep the raw segment rather than failing the load.
        return lastSegment;
    }
}
/**
 * Base64-encode an ArrayBuffer. Bytes are converted in 32KB chunks so that no
 * single String.fromCharCode call receives an oversized argument list.
 */
function arrayBufferToBase64(buffer) {
    const bytes = new Uint8Array(buffer);
    const chunkSize = 0x8000;
    let binary = "";
    for (let offset = 0; offset < bytes.length; offset += chunkSize) {
        // subarray is a view, so no per-chunk copy is made.
        binary += String.fromCharCode.apply(null, bytes.subarray(offset, offset + chunkSize));
    }
    return btoa(binary);
}
/**
 * Extract per-page text and a first-page thumbnail from a PDF.
 *
 * The text is wrapped as `<pdf filename="…">` containing one
 * `<page number="N">` element per page; within a page, non-blank text items
 * are joined with single spaces.
 *
 * @param arrayBuffer - PDF bytes, passed to pdf.js as `data`
 * @param fileName - Name written into the `<pdf>` wrapper
 * @returns Promise<{ extractedText, preview }>
 * @throws Error ("Failed to process PDF: …") if loading or extraction fails
 */
async function processPdf(arrayBuffer, fileName) {
    let doc = null;
    try {
        const loadingTask = pdfjsLib.getDocument({ data: arrayBuffer });
        doc = await loadingTask.promise;
        const pieces = [`<pdf filename="${fileName}">`];
        const pageCount = doc.numPages;
        let pageNumber = 1;
        while (pageNumber <= pageCount) {
            const page = await doc.getPage(pageNumber);
            const { items } = await page.getTextContent();
            const words = [];
            for (const item of items) {
                const str = item.str;
                // Skip items that are empty or whitespace-only.
                if (str.trim()) {
                    words.push(str);
                }
            }
            pieces.push(`\n<page number="${pageNumber}">\n${words.join(" ")}\n</page>`);
            pageNumber += 1;
        }
        pieces.push("\n</pdf>");
        // The thumbnail is rendered only after all text has been collected.
        const preview = await generatePdfPreview(doc);
        return { extractedText: pieces.join(""), preview };
    }
    catch (error) {
        console.error("Error processing PDF:", error);
        throw new Error(`Failed to process PDF: ${String(error)}`);
    }
    finally {
        // Release the document on success and failure alike.
        doc?.destroy();
    }
}
/**
 * Render the first page of a PDF into a thumbnail that fits within 160x160
 * and return it as base64-encoded PNG data (no `data:` URL prefix).
 *
 * Best-effort: returns undefined when no 2D canvas context is available or
 * when rendering fails (the failure is logged, not thrown).
 *
 * @param pdf - Loaded pdf.js document
 * @returns Promise<string | undefined>
 */
async function generatePdfPreview(pdf) {
    const maxEdge = 160;
    try {
        const firstPage = await pdf.getPage(1);
        // Measure at scale 1, then pick the scale that fits both dimensions.
        const { width, height } = firstPage.getViewport({ scale: 1.0 });
        const fitScale = Math.min(maxEdge / width, maxEdge / height);
        const thumbViewport = firstPage.getViewport({ scale: fitScale });
        const canvas = document.createElement("canvas");
        const ctx = canvas.getContext("2d");
        if (!ctx) {
            return undefined;
        }
        canvas.height = thumbViewport.height;
        canvas.width = thumbViewport.width;
        const renderTask = firstPage.render({
            canvasContext: ctx,
            viewport: thumbViewport,
            canvas,
        });
        await renderTask.promise;
        // Keep only the payload after the "data:image/png;base64," prefix.
        const [, base64Png] = canvas.toDataURL("image/png").split(",");
        return base64Png;
    }
    catch (error) {
        console.error("Error generating PDF preview:", error);
        return undefined;
    }
}
/**
 * Extract text from a DOCX file.
 *
 * The document is parsed with docx-preview's parseAsync; every top-level body
 * element is converted with extractTextFromElement and the non-empty results
 * are joined with newlines inside a single `<page number="1">` element of a
 * `<docx filename="…">` wrapper.
 *
 * @param arrayBuffer - DOCX bytes
 * @param fileName - Name written into the `<docx>` wrapper
 * @returns Promise<{ extractedText }>
 * @throws Error ("Failed to process DOCX: …") if parsing or extraction fails
 */
async function processDocx(arrayBuffer, fileName) {
    try {
        const parsed = await parseAsync(arrayBuffer);
        const bodyChildren = parsed.documentPart?.body?.children;
        const blocks = [];
        if (bodyChildren) {
            for (const node of bodyChildren) {
                const blockText = extractTextFromElement(node);
                if (!blockText) {
                    continue;
                }
                blocks.push(blockText);
            }
        }
        const opening = `<docx filename="${fileName}">\n<page number="1">\n`;
        const closing = `\n</page>\n</docx>`;
        return { extractedText: opening + blocks.join("\n") + closing };
    }
    catch (error) {
        console.error("Error processing DOCX:", error);
        throw new Error(`Failed to process DOCX: ${String(error)}`);
    }
}
// Element type names (lower-cased) accepted for table rows and cells.
// NOTE(review): docx-preview appears to tag these as "row" / "cell"; the
// "tablerow" / "tablecell" spellings are kept for backward compatibility.
const DOCX_TABLE_ROW_TYPES = new Set(["row", "tablerow"]);
const DOCX_TABLE_CELL_TYPES = new Set(["cell", "tablecell"]);
/**
 * Extract plain text from a parsed DOCX element tree node.
 *
 * - Paragraphs: the text of all "text" leaves is concatenated, including those
 *   nested in runs and in other inline wrappers (e.g. a hyperlink around runs).
 * - Tables: each row becomes one line with cells joined by " | ", wrapped in
 *   [Table] … [/Table] markers; a table with no text yields "".
 * - Any other element with an array of children: the non-empty texts of its
 *   children joined by single spaces.
 *
 * @param element - Node with optional `type`, `children`, and `text` fields
 * @returns Trimmed text ("" for null/undefined or text-less elements)
 */
function extractTextFromElement(element) {
    if (!element) {
        return "";
    }
    const elementType = element.type?.toLowerCase() || "";
    let text = "";
    if (elementType === "paragraph" && element.children) {
        text = collectDocxInlineText(element.children);
    }
    else if (elementType === "table") {
        text = extractDocxTableText(element);
    }
    else if (Array.isArray(element.children)) {
        // Recursively handle other container elements
        text = element.children
            .map((child) => extractTextFromElement(child))
            .filter(Boolean)
            .join(" ");
    }
    return text.trim();
}
/** Concatenate (without separators) every "text" leaf found under the given inline children. */
function collectDocxInlineText(children) {
    let text = "";
    for (const child of children) {
        const childType = child?.type?.toLowerCase() || "";
        if (childType === "text") {
            text += child.text || "";
        }
        else if (child?.children) {
            // Runs and other inline wrappers: descend to reach their text leaves.
            text += collectDocxInlineText(child.children);
        }
    }
    return text;
}
/** Render a table element as "[Table]" + one " | "-joined line per row + "[/Table]", or "" if it has no text. */
function extractDocxTableText(table) {
    const rowLines = [];
    for (const row of table.children ?? []) {
        const rowType = row?.type?.toLowerCase() || "";
        if (!DOCX_TABLE_ROW_TYPES.has(rowType) || !row.children) {
            continue;
        }
        const cellTexts = [];
        for (const cell of row.children) {
            const cellType = cell?.type?.toLowerCase() || "";
            if (!DOCX_TABLE_CELL_TYPES.has(cellType) || !cell.children) {
                continue;
            }
            const parts = [];
            for (const cellElement of cell.children) {
                const cellText = extractTextFromElement(cellElement);
                if (cellText) {
                    parts.push(cellText);
                }
            }
            if (parts.length > 0) {
                cellTexts.push(parts.join(" "));
            }
        }
        if (cellTexts.length > 0) {
            rowLines.push(cellTexts.join(" | "));
        }
    }
    return rowLines.length > 0 ? `\n[Table]\n${rowLines.join("\n")}\n[/Table]\n` : "";
}
// Matches the text payload of a DrawingML <a:t> element (with or without attributes).
const PPTX_TEXT_RUN_PATTERN = /<a:t(?:\s[^>]*)?>([^<]+)<\/a:t>/g;
const PPTX_NAMED_ENTITIES = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
/**
 * Extract slide text and speaker notes from a PPTX file.
 *
 * The file is opened as a ZIP; `ppt/slides/slideN.xml` parts are read in
 * numeric order and the text of their `<a:t>` elements is emitted, one line
 * per text run, inside `<slide number="K">` elements (K is the 1-based position
 * in the sorted slide list; slides without any `<a:t>` text are omitted).
 * Text from `ppt/notesSlides/notesSlideN.xml` parts is appended inside a
 * `<notes>` element as `[Slide N notes]: …` lines, where N is the number in
 * the notes part's file name. XML character/entity references in the text
 * (e.g. `&amp;`, `&#233;`) are decoded.
 *
 * @param arrayBuffer - PPTX bytes
 * @param fileName - Name written into the `<pptx>` wrapper
 * @returns Promise<{ extractedText }>
 * @throws Error ("Failed to process PPTX: …") if the archive cannot be read
 */
async function processPptx(arrayBuffer, fileName) {
    try {
        // Load the PPTX file as a ZIP
        const zip = await JSZip.loadAsync(arrayBuffer);
        let extractedText = `<pptx filename="${fileName}">`;

        const slideParts = listNumberedPptxParts(zip, /ppt\/slides\/slide(\d+)\.xml$/);
        for (const [index, part] of slideParts.entries()) {
            const slideTexts = await readPptxPartTexts(zip, part.name);
            if (slideTexts === null) {
                continue;
            }
            extractedText += `\n<slide number="${index + 1}">`;
            if (slideTexts.length > 0) {
                extractedText += `\n${slideTexts.join("\n")}`;
            }
            extractedText += "\n</slide>";
        }

        // Also try to extract text from notes
        const noteParts = listNumberedPptxParts(zip, /ppt\/notesSlides\/notesSlide(\d+)\.xml$/);
        if (noteParts.length > 0) {
            extractedText += "\n<notes>";
            for (const part of noteParts) {
                const noteTexts = await readPptxPartTexts(zip, part.name);
                if (noteTexts !== null && noteTexts.length > 0) {
                    extractedText += `\n[Slide ${part.number} notes]: ${noteTexts.join(" ")}`;
                }
            }
            extractedText += "\n</notes>";
        }
        extractedText += "\n</pptx>";
        return { extractedText };
    }
    catch (error) {
        console.error("Error processing PPTX:", error);
        throw new Error(`Failed to process PPTX: ${String(error)}`);
    }
}
/**
 * List archive entries whose name matches `pattern` (first capture group = part
 * number), sorted numerically. Each item is `{ name, number }`, with `number`
 * kept as the digit string from the file name.
 */
function listNumberedPptxParts(zip, pattern) {
    return Object.keys(zip.files)
        .map((name) => ({ name, number: name.match(pattern)?.[1] }))
        .filter((part) => part.number !== undefined)
        .sort((a, b) => Number.parseInt(a.number, 10) - Number.parseInt(b.number, 10));
}
/**
 * Read one XML part and return the decoded, non-blank texts of its <a:t>
 * elements. Returns null when the part is missing or has no <a:t> text at all
 * (callers use this to skip the part entirely).
 */
async function readPptxPartTexts(zip, partName) {
    const entry = zip.file(partName);
    if (!entry) {
        return null;
    }
    const xml = await entry.async("text");
    const matches = [...xml.matchAll(PPTX_TEXT_RUN_PATTERN)];
    if (matches.length === 0) {
        return null;
    }
    return matches
        .map((match) => decodePptxXmlEntities(match[1]))
        .filter((text) => text.trim());
}
/** Decode the five predefined XML entities and numeric character references in one pass. */
function decodePptxXmlEntities(text) {
    return text.replace(/&(#x[0-9a-fA-F]+|#\d+|amp|lt|gt|quot|apos);/g, (whole, body) => {
        if (body[0] !== "#") {
            return PPTX_NAMED_ENTITIES[body];
        }
        const codePoint = body[1] === "x"
            ? Number.parseInt(body.slice(2), 16)
            : Number.parseInt(body.slice(1), 10);
        // Leave out-of-range references untouched instead of throwing.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    });
}
/**
 * Extract the contents of an Excel workbook as text.
 *
 * Every sheet, in workbook order, is converted to CSV and wrapped in a
 * `<sheet name="…" index="N">` element (N is 1-based) inside an
 * `<excel filename="…">` wrapper.
 *
 * @param arrayBuffer - Workbook bytes, read with `{ type: "array" }`
 * @param fileName - Name written into the `<excel>` wrapper
 * @returns Promise<{ extractedText }>
 * @throws Error ("Failed to process Excel: …") if reading or conversion fails
 */
async function processExcel(arrayBuffer, fileName) {
    try {
        const book = XLSX.read(arrayBuffer, { type: "array" });
        const sheetBlocks = book.SheetNames.map((sheetName, position) => {
            const csv = XLSX.utils.sheet_to_csv(book.Sheets[sheetName]);
            return `\n<sheet name="${sheetName}" index="${position + 1}">\n${csv}\n</sheet>`;
        });
        const extractedText = [`<excel filename="${fileName}">`, ...sheetBlocks, "\n</excel>"].join("");
        return { extractedText };
    }
    catch (error) {
        console.error("Error processing Excel:", error);
        throw new Error(`Failed to process Excel: ${String(error)}`);
    }
}
//# sourceMappingURL=attachment-utils.js.map