fix: use file-type for mime sniffing

This commit is contained in:
Peter Steinberger
2025-12-20 19:13:50 +01:00
parent 1356498ee1
commit 36c85a617a
14 changed files with 5291 additions and 91 deletions

View File

@@ -1,5 +1,6 @@
import path from "node:path";
import { fileTypeFromBuffer } from "file-type";
import { type MediaKind, mediaKindFromMime } from "./constants.js";
// Map common mimes to preferred file extensions.
@@ -12,7 +13,23 @@ const EXT_BY_MIME: Record<string, string> = {
"audio/mpeg": ".mp3",
"video/mp4": ".mp4",
"application/pdf": ".pdf",
"application/json": ".json",
"application/zip": ".zip",
"application/gzip": ".gz",
"application/x-tar": ".tar",
"application/x-7z-compressed": ".7z",
"application/vnd.rar": ".rar",
"application/msword": ".doc",
"application/vnd.ms-excel": ".xls",
"application/vnd.ms-powerpoint": ".ppt",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document":
".docx",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
"application/vnd.openxmlformats-officedocument.presentationml.presentation":
".pptx",
"text/csv": ".csv",
"text/plain": ".txt",
"text/markdown": ".md",
};
const MIME_BY_EXT: Record<string, string> = Object.fromEntries(
@@ -25,71 +42,14 @@ function normalizeHeaderMime(mime?: string | null): string | undefined {
return cleaned || undefined;
}
/**
 * Detects a MIME type from file content using `file-type`'s
 * `fileTypeFromBuffer`.
 *
 * Detection is best-effort: any failure is reported as "unknown" rather than
 * propagated to the caller.
 *
 * @param buffer - File content to inspect; may be omitted.
 * @returns The detected MIME type, or `undefined` when no (or an empty) buffer
 *   was supplied, the content was not recognised, or detection threw.
 */
async function sniffMime(buffer?: Buffer): Promise<string | undefined> {
  // Nothing to inspect: skip the detector entirely.
  if (!buffer || buffer.length === 0) return undefined;
  try {
    const type = await fileTypeFromBuffer(buffer);
    return type?.mime;
  } catch {
    // Deliberately swallowed: a sniffing failure must not break the caller.
    return undefined;
  }
}
function extFromPath(filePath?: string): string | undefined {
@@ -110,15 +70,34 @@ export function detectMime(opts: {
buffer?: Buffer;
headerMime?: string | null;
filePath?: string;
}): string | undefined {
const sniffed = sniffMime(opts.buffer);
if (sniffed) return sniffed;
}): Promise<string | undefined> {
return detectMimeImpl(opts);
}
/**
 * Reports whether a MIME type is too generic to identify the actual content.
 *
 * A missing or empty value counts as generic, as do
 * `application/octet-stream` and `application/zip` (compared
 * case-insensitively).
 *
 * @param mime - MIME type to classify; may be omitted.
 * @returns `true` when the value is absent or one of the generic types.
 */
function isGenericMime(mime?: string): boolean {
  if (!mime) {
    return true;
  }
  switch (mime.toLowerCase()) {
    case "application/octet-stream":
    case "application/zip":
      return true;
    default:
      return false;
  }
}
/**
 * Picks a MIME type from three hints: sniffed content, the file extension and
 * a header-supplied MIME type.
 *
 * Order of preference, as implemented below:
 *  1. the sniffed type, unless it is generic (see `isGenericMime`) and the
 *     extension maps to a known type;
 *  2. the type mapped from the file extension;
 *  3. the header type, when it is not generic;
 *  4. whatever sniffed/header value remains, even if generic.
 *
 * @param opts.buffer - File content handed to `sniffMime`.
 * @param opts.headerMime - MIME type reported by a header, if any.
 * @param opts.filePath - Path or URL whose extension is looked up in
 *   `MIME_BY_EXT`.
 * @returns The chosen MIME type, or `undefined` when no hint yields one.
 */
async function detectMimeImpl(opts: {
  buffer?: Buffer;
  headerMime?: string | null;
  filePath?: string;
}): Promise<string | undefined> {
  const ext = extFromPath(opts.filePath);
  const extMime = ext ? MIME_BY_EXT[ext] : undefined;

  const headerMime = normalizeHeaderMime(opts.headerMime);
  const sniffed = await sniffMime(opts.buffer);

  // Prefer sniffed types, but don't let generic container types override a more
  // specific extension mapping (e.g. XLSX vs ZIP).
  if (sniffed && (!isGenericMime(sniffed) || !extMime)) return sniffed;
  if (extMime) return extMime;
  if (headerMime && !isGenericMime(headerMime)) return headerMime;
  // NOTE(review): this fallback looks unreachable — a truthy `sniffed` only
  // gets past the first check when `extMime` is set, and then the `extMime`
  // return above fires. Confirm whether a generic sniff was meant to lose to a
  // specific header before changing the first condition.
  if (sniffed) return sniffed;
  if (headerMime) return headerMime;

  return undefined;
}

View File

@@ -38,7 +38,7 @@ export function attachMediaRoutes(
return;
}
const data = await fs.readFile(realPath);
const mime = detectMime({ buffer: data, filePath: realPath });
const mime = await detectMime({ buffer: data, filePath: realPath });
if (mime) res.type(mime);
res.send(data);
// best-effort single-use cleanup after response ends

View File

@@ -2,7 +2,16 @@ import fs from "node:fs/promises";
import path from "node:path";
import { PassThrough } from "node:stream";
import { afterAll, beforeAll, describe, expect, it, vi } from "vitest";
import JSZip from "jszip";
import {
afterAll,
beforeAll,
beforeEach,
describe,
expect,
it,
vi,
} from "vitest";
const realOs = await vi.importActual<typeof import("node:os")>("node:os");
const HOME = path.join(realOs.tmpdir(), "clawdis-home-redirect");
@@ -25,6 +34,10 @@ describe("media store redirects", () => {
await fs.rm(HOME, { recursive: true, force: true });
});
// Reset the request mock before every test so state left by a previous test
// (for example a queued one-shot implementation) does not carry over.
beforeEach(() => {
mockRequest.mockReset();
});
afterAll(async () => {
await fs.rm(HOME, { recursive: true, force: true });
vi.clearAllMocks();
@@ -71,4 +84,47 @@ describe("media store redirects", () => {
expect(path.extname(saved.path)).toBe(".txt");
expect(await fs.readFile(saved.path, "utf8")).toBe("redirected");
});
// The mocked response has status 200, no headers, and a body that is a zip
// archive laid out like an XLSX workbook; the URL has no file extension. Only
// the content is therefore available to determine the type.
it("sniffs xlsx from zip content when headers and url extension are missing", async () => {
mockRequest.mockImplementationOnce((_url, _opts, cb) => {
const res = new PassThrough();
// Minimal stand-in for the request object: "error" listeners are forwarded to
// the response stream, and destroy() tears the response stream down.
const req = {
on: (event: string, handler: (...args: unknown[]) => void) => {
if (event === "error") res.on("error", handler);
return req;
},
end: () => undefined,
destroy: () => res.destroy(),
} as const;
res.statusCode = 200;
res.headers = {};
// Deliver the response asynchronously: hand it to the callback first, then
// generate the archive, write its bytes and end the stream.
setImmediate(() => {
cb(res as unknown as Parameters<typeof cb>[0]);
const zip = new JSZip();
zip.file(
"[Content_Types].xml",
'<Types><Override PartName="/xl/workbook.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml"/></Types>',
);
zip.file("xl/workbook.xml", "<workbook/>");
void zip
.generateAsync({ type: "nodebuffer" })
.then((buf) => {
res.write(buf);
res.end();
})
.catch((err) => {
// Surface archive-generation failures as a stream error.
res.destroy(err);
});
});
return req;
});
const saved = await saveMediaSource("https://example.com/download");
expect(saved.contentType).toBe(
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
);
expect(path.extname(saved.path)).toBe(".xlsx");
});
});

View File

@@ -1,5 +1,6 @@
import fs from "node:fs/promises";
import path from "node:path";
import JSZip from "jszip";
import sharp from "sharp";
import { afterAll, beforeAll, describe, expect, it, vi } from "vitest";
@@ -70,6 +71,18 @@ describe("media store", () => {
await expect(fs.stat(saved.path)).rejects.toThrow();
});
// The file content is plain text, so the expected type can only come from the
// ".xlsx" extension.
it("sets correct mime for xlsx by extension", async () => {
  await fs.mkdir(HOME, { recursive: true });
  const sourcePath = path.join(HOME, "sheet.xlsx");
  await fs.writeFile(sourcePath, "not really an xlsx");

  const result = await store.saveMediaSource(sourcePath);
  const savedExtension = path.extname(result.path);

  expect(result.contentType).toBe(
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  );
  expect(savedExtension).toBe(".xlsx");
});
it("renames media based on detected mime even when extension is wrong", async () => {
const pngBytes = await sharp({
create: { width: 2, height: 2, channels: 3, background: "#00ff00" },
@@ -86,4 +99,22 @@ describe("media store", () => {
const buf = await fs.readFile(saved.path);
expect(buf.equals(pngBytes)).toBe(true);
});
// The source file carries a ".bin" extension, so the expected XLSX type and
// ".xlsx" extension have to be derived from the archive content.
it("sniffs xlsx mime for zip buffers and renames extension", async () => {
  // Build a small zip archive containing the entries of an XLSX workbook.
  const archive = new JSZip();
  archive.file(
    "[Content_Types].xml",
    '<Types><Override PartName="/xl/workbook.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml"/></Types>',
  );
  archive.file("xl/workbook.xml", "<workbook/>");
  const workbookBytes = await archive.generateAsync({ type: "nodebuffer" });

  const sourcePath = path.join(HOME, "sheet.bin");
  await fs.writeFile(sourcePath, workbookBytes);

  const result = await store.saveMediaSource(sourcePath);
  const savedExtension = path.extname(result.path);

  expect(result.contentType).toBe(
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  );
  expect(savedExtension).toBe(".xlsx");
});
});

View File

@@ -126,7 +126,7 @@ export async function saveMediaSource(
tempDest,
headers,
);
const mime = detectMime({
const mime = await detectMime({
buffer: sniffBuffer,
headerMime,
filePath: source,
@@ -147,7 +147,7 @@ export async function saveMediaSource(
throw new Error("Media exceeds 5MB limit");
}
const buffer = await fs.readFile(source);
const mime = detectMime({ buffer, filePath: source });
const mime = await detectMime({ buffer, filePath: source });
const ext = extensionForMime(mime) ?? path.extname(source);
const id = ext ? `${baseId}${ext}` : baseId;
const dest = path.join(dir, id);
@@ -169,7 +169,7 @@ export async function saveMediaBuffer(
const dir = path.join(MEDIA_DIR, subdir);
await fs.mkdir(dir, { recursive: true });
const baseId = crypto.randomUUID();
const mime = detectMime({ buffer, headerMime: contentType });
const mime = await detectMime({ buffer, headerMime: contentType });
const ext = extensionForMime(mime);
const id = ext ? `${baseId}${ext}` : baseId;
const dest = path.join(dir, id);