const { randomUUID } = require("node:crypto");

const { QdrantClient } = require("@qdrant/js-client-rest");
const { pipeline } = require("@xenova/transformers");

// Name of the Qdrant collection every point is written to.
// NOTE(review): this is `undefined` when QDRANT_COLLECTION is not set; no
// validation is done here — confirm the deployment always provides it.
const COLLECTION = process.env.QDRANT_COLLECTION;

const qdrant = new QdrantClient({
  url: process.env.QDRANT_URL,
  checkCompatibility: false,
  timeout: 30000,
});

// Globally filter console.warn: string messages containing
// "UnknownErrorException" or "TT:" are dropped, everything else is forwarded
// to the original implementation.
// NOTE(review): these look like pdf.js font/parse warnings — confirm.
const originalWarn = console.warn;
console.warn = (msg, ...args) => {
  if (
    typeof msg === "string" &&
    (msg.includes("UnknownErrorException") || msg.includes("TT:"))
  ) {
    return;
  }
  originalWarn(msg, ...args);
};

// In-flight or resolved promise for the feature-extraction pipeline.
let embedderPromise = null;

/**
 * Lazily loads the "Xenova/all-MiniLM-L6-v2" feature-extraction pipeline.
 *
 * The promise itself is cached (not the resolved model) so that concurrent
 * first calls share a single load. A failed load clears the cache so a later
 * call can retry.
 *
 * @returns {Promise<Function>} The pipeline callable.
 */
async function getEmbedder() {
  if (!embedderPromise) {
    embedderPromise = pipeline(
      "feature-extraction",
      "Xenova/all-MiniLM-L6-v2"
    ).catch((err) => {
      embedderPromise = null;
      throw err;
    });
  }
  return embedderPromise;
}

/**
 * Embeds a piece of text with mean pooling and normalization enabled.
 *
 * @param {string} text - Text to embed.
 * @returns {Promise<number[]>} The embedding copied into a plain array.
 */
async function createEmbedding(text) {
  const model = await getEmbedder();
  const output = await model(text, { pooling: "mean", normalize: true });
  return Array.from(output.data);
}

/**
 * Makes sure the target collection exists, creating it with 384-dimensional
 * cosine vectors when the lookup fails.
 *
 * Any lookup failure (not only "not found") leads to a create attempt; if the
 * failure was something else, the create call is expected to reject and that
 * error propagates to the caller.
 *
 * @returns {Promise<void>}
 */
async function ensureCollection() {
  try {
    await qdrant.getCollection(COLLECTION);
  } catch {
    await qdrant.createCollection(COLLECTION, {
      vectors: { size: 384, distance: "Cosine" },
    });
    console.log(`[QDRANT] Collection '${COLLECTION}' created.`);
  }
}

/**
 * Splits text into overlapping chunks, preferring to end a chunk just after
 * a period. Whitespace runs are collapsed to single spaces first, and chunks
 * of 50 characters or fewer are discarded.
 *
 * @param {string} text - Raw text to split.
 * @param {number} [chunkSize=1200] - Target chunk length in characters.
 * @param {number} [overlap=250] - Characters shared between adjacent chunks.
 * @returns {string[]} The chunks, in document order.
 * @throws {RangeError} If `chunkSize` is not positive or `overlap` is not in
 *   the range [0, chunkSize) — such values could never make progress.
 */
function chunkText(text, chunkSize = 1200, overlap = 250) {
  if (!(chunkSize > 0) || !(overlap >= 0) || overlap >= chunkSize) {
    throw new RangeError(
      "chunkText requires chunkSize > 0 and 0 <= overlap < chunkSize"
    );
  }

  const chunks = [];
  const normalized = text.replace(/\s+/g, " ").trim();
  let start = 0;

  while (start < normalized.length) {
    let end = start + chunkSize;

    if (end < normalized.length) {
      const lastPeriod = normalized.lastIndexOf(".", end);
      // Snap to the sentence boundary only when the next window would still
      // begin after the current one; otherwise `start` would stall or move
      // backwards and the loop would never terminate.
      if (lastPeriod + 1 - overlap > start) {
        end = lastPeriod + 1;
      }
    }

    const chunk = normalized.slice(start, end).trim();
    if (chunk.length > 50) {
      chunks.push(chunk);
    }

    // The tail has been consumed; stepping back by `overlap` again would only
    // re-emit text already contained in the chunk just produced.
    if (end >= normalized.length) {
      break;
    }
    start = end - overlap;
  }

  return chunks;
}

/**
 * Upserts a batch of points into the collection and logs the count.
 *
 * @param {object[]} points - Points to write ({ id, vector, payload }).
 * @param {string} [suffix=""] - Text appended to the log line.
 * @returns {Promise<number>} Number of points written.
 */
async function upsertBatch(points, suffix = "") {
  await qdrant.upsert(COLLECTION, { wait: true, points });
  console.log(`[UPSERT] ${points.length} vectors${suffix}`);
  return points.length;
}

/**
 * Extracts the text of every page of a PDF, chunks and embeds it, and upserts
 * the vectors into Qdrant in batches of 50.
 *
 * @param {Buffer|ArrayBuffer|ArrayLike<number>} buffer - Raw PDF bytes.
 * @param {string} fileName - Name stored in each point's payload.
 * @returns {Promise<{fileName: string, pages: number, vectors: number}>}
 *   Summary of what was ingested.
 */
async function extractAndInsert(buffer, fileName) {
  const { getDocument } = await import("pdfjs-dist/legacy/build/pdf.mjs");
  const pdf = await getDocument({ data: new Uint8Array(buffer) }).promise;

  try {
    const pageCount = pdf.numPages;
    console.log(`[PDF] ${fileName} — ${pageCount} pages`);
    await ensureCollection();

    const BATCH = 50;
    let batch = [];
    let chunkIdx = 0;
    let total = 0;

    for (let pageNum = 1; pageNum <= pageCount; pageNum++) {
      console.log(`[PAGE] ${pageNum}/${pageCount}`);
      const page = await pdf.getPage(pageNum);
      const content = await page.getTextContent();
      const text = content.items.map((item) => item.str).join(" ");
      if (!text || text.trim().length === 0) continue;

      for (const chunk of chunkText(text)) {
        const vector = await createEmbedding(chunk);
        batch.push({
          // UUIDs stay unique across uploads and chunk counts. A number built
          // from `${Date.now()}${chunkIdx}` exceeds Number.MAX_SAFE_INTEGER
          // once chunkIdx reaches four digits and then collides.
          id: randomUUID(),
          vector,
          payload: {
            file: fileName,
            page: pageNum,
            chunk: chunkIdx,
            text: chunk,
            created_at: new Date().toISOString(),
          },
        });
        chunkIdx++;

        if (batch.length >= BATCH) {
          total += await upsertBatch(batch);
          batch = [];
        }
      }
    }

    if (batch.length > 0) {
      total += await upsertBatch(batch, " (final)");
    }

    return { fileName, pages: pageCount, vectors: total };
  } finally {
    // Release the pdf.js document; a cleanup failure must not mask the
    // ingestion result or the original error.
    try {
      await pdf.destroy();
    } catch (cleanupErr) {
      console.error("[PDF] cleanup failed", cleanupErr);
    }
  }
}

/**
 * HTTP handler: ingests the uploaded PDF found on `req.file`.
 *
 * Responds 400 when no file is attached, 500 with the error message when
 * ingestion fails, and otherwise a JSON summary including elapsed ms.
 * NOTE(review): `req.file.buffer` presumably comes from an in-memory upload
 * middleware using the field name 'pdf' — confirm against the router.
 *
 * @param {object} req - Request; reads `req.file.{buffer,originalname,size}`.
 * @param {object} res - Response; uses `res.status().json()` / `res.json()`.
 * @returns {Promise<*>} Whatever the response's `json()` call returns.
 */
const upload = async (req, res) => {
  if (!req.file) {
    return res
      .status(400)
      .json({ success: false, error: "No file. Use field name 'pdf'." });
  }

  const t0 = Date.now();
  console.log(
    `[UPLOAD] ${req.file.originalname} (${(req.file.size / 1024).toFixed(1)} KB)`
  );

  try {
    const result = await extractAndInsert(
      req.file.buffer,
      req.file.originalname
    );
    return res.json({ success: true, ms: Date.now() - t0, ...result });
  } catch (err) {
    // Log the full error (stack included) for diagnosis; only the message is
    // sent back to the client.
    console.error("[QDRANT INSERT ERROR]", err);
    return res.status(500).json({ success: false, error: err.message });
  }
};

module.exports = { upload };