// PDF ingestion for Qdrant: extracts text from an uploaded PDF, embeds it in
// chunks, and upserts the resulting vectors into a Qdrant collection.
const { randomUUID } = require("node:crypto");

const { QdrantClient } = require("@qdrant/js-client-rest");
const { pipeline } = require("@xenova/transformers");

// Name of the Qdrant collection used by this module, read once at load time
// from the QDRANT_COLLECTION environment variable.
const COLLECTION = process.env.QDRANT_COLLECTION;

// Shared Qdrant REST client for this module, configured from QDRANT_URL.
const qdrant = new QdrantClient({
  url: process.env.QDRANT_URL,
  // NOTE(review): presumably disables the client/server version check — confirm
  // against the client documentation.
  checkCompatibility: false,
  // NOTE(review): presumably a request timeout in milliseconds — confirm.
  timeout: 30000,
});

// Original console.warn, kept so that unfiltered warnings are still emitted.
const _warn = console.warn;

// String warnings containing any of these markers are dropped.
// NOTE(review): these look like noisy PDF-parsing warnings — confirm the source.
const SUPPRESSED_WARNING_MARKERS = ["UnknownErrorException", "TT:"];

/**
 * Replacement for console.warn that silently drops string messages containing
 * one of the suppressed markers and forwards everything else unchanged.
 * Non-string first arguments are never filtered.
 */
console.warn = (msg, ...args) => {
  const isSuppressed =
    typeof msg === "string" &&
    SUPPRESSED_WARNING_MARKERS.some((marker) => msg.includes(marker));
  if (isSuppressed) return;
  _warn(msg, ...args);
};

// Lazily created feature-extraction pipeline, shared by all callers.
let embedder = null;
// In-flight load of the pipeline, so concurrent callers share a single load.
let embedderLoading = null;

/**
 * Returns the shared feature-extraction pipeline, creating it on first use.
 *
 * Concurrent calls made while the pipeline is still loading all receive the
 * same in-flight promise instead of each starting their own load. If loading
 * fails, the failure is reported to the waiting callers and the next call
 * starts a fresh attempt.
 *
 * @returns {Promise<Function>} The loaded pipeline.
 */
async function getEmbedder() {
  if (embedder) return embedder;

  if (!embedderLoading) {
    // Promise.resolve().then(...) defers the call so the cleanup in finally()
    // can never run before embedderLoading has been assigned.
    embedderLoading = Promise.resolve()
      .then(() => pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2"))
      .then((loaded) => {
        embedder = loaded;
        return loaded;
      })
      .finally(() => {
        embedderLoading = null;
      });
  }

  return embedderLoading;
}

/**
 * Embeds a piece of text with the shared feature-extraction pipeline.
 *
 * The pipeline is invoked with mean pooling and normalization enabled, and the
 * resulting typed data is copied into a plain array.
 *
 * @param {string} text - Text to embed.
 * @returns {Promise<number[]>} The embedding as a plain array of numbers.
 */
async function createEmbedding(text) {
  const extractor = await getEmbedder();
  const { data } = await extractor(text, { pooling: "mean", normalize: true });
  return Array.from(data);
}

/**
 * Makes sure the target collection exists, creating it (384-dimensional
 * vectors, cosine distance) when the lookup fails.
 *
 * If creation itself fails, the collection is looked up once more: when it is
 * now present (for example because a concurrent request created it first) the
 * call succeeds quietly; otherwise the creation error is rethrown.
 *
 * @returns {Promise<void>}
 * @throws Whatever `createCollection` rejected with, when the collection still
 *   cannot be found afterwards.
 */
async function ensureCollection() {
  try {
    await qdrant.getCollection(COLLECTION);
    return;
  } catch {
    // Lookup failed: treat the collection as missing and try to create it.
  }

  try {
    await qdrant.createCollection(COLLECTION, {
      vectors: { size: 384, distance: "Cosine" },
    });
  } catch (createErr) {
    try {
      await qdrant.getCollection(COLLECTION);
    } catch {
      throw createErr;
    }
    // The collection exists after all; nothing was created by this call.
    return;
  }

  console.log(`[QDRANT] Collection '${COLLECTION}' created.`);
}

/**
 * Splits text into overlapping chunks, preferring to end a chunk just after a
 * period when that still leaves room to move forward.
 *
 * Whitespace runs are collapsed to single spaces first. Chunks of 50
 * characters or fewer (after trimming) are discarded.
 *
 * The scan always advances: a sentence break is only used when the next
 * window (break position minus overlap) starts after the current one, and the
 * overlap is clamped below the chunk size. Scanning stops as soon as a chunk
 * reaches the end of the text, so no tail chunk that is fully contained in the
 * previous chunk is produced.
 *
 * @param {string} text - Text to split; null/undefined yields no chunks.
 * @param {number} [chunkSize=1200] - Target maximum chunk length in characters.
 * @param {number} [overlap=250] - Characters shared between consecutive chunks.
 * @returns {string[]} The chunks, in document order.
 */
function chunkText(text, chunkSize = 1200, overlap = 250) {
  const chunks = [];
  const normalized = String(text ?? "").replace(/\s+/g, " ").trim();

  // Clamp the parameters so every iteration moves `start` forward.
  const size = Math.max(1, chunkSize);
  const lap = Math.min(Math.max(0, overlap), size - 1);

  let start = 0;
  while (start < normalized.length) {
    let end = Math.min(start + size, normalized.length);

    if (end < normalized.length) {
      const lastPeriod = normalized.lastIndexOf(".", end);
      // Only cut at the sentence end if the next window still starts after
      // the current one; otherwise fall back to a hard cut at `size`.
      if (lastPeriod + 1 - lap > start) end = lastPeriod + 1;
    }

    const chunk = normalized.slice(start, end).trim();
    if (chunk.length > 50) chunks.push(chunk);

    if (end >= normalized.length) break;
    start = end - lap;
  }

  return chunks;
}

/**
 * Extracts the text of every page of a PDF, embeds it chunk by chunk, and
 * upserts the resulting points into the Qdrant collection in batches.
 *
 * Each point gets a random UUID as its id. The previous scheme concatenated
 * `Date.now()` with the chunk index into a Number, which exceeds
 * Number.MAX_SAFE_INTEGER for larger indices and can collide, silently
 * overwriting points. The PDF document is released once processing finishes,
 * whether it succeeded or failed.
 *
 * @param {Buffer|ArrayBuffer|Uint8Array} buffer - Raw PDF bytes.
 * @param {string} fileName - Name stored in each point's payload and in logs.
 * @returns {Promise<{fileName: string, pages: number, vectors: number}>}
 *   Summary of what was processed and stored.
 * @throws Propagates errors from PDF parsing, embedding, and Qdrant calls.
 */
async function extractAndInsert(buffer, fileName) {
  const { getDocument } = await import("pdfjs-dist/legacy/build/pdf.mjs");

  const loadingTask = getDocument({ data: new Uint8Array(buffer) });

  try {
    const pdf = await loadingTask.promise;
    const pageCount = pdf.numPages;
    console.log(`[PDF] ${fileName} — ${pageCount} pages`);

    await ensureCollection();

    const BATCH = 50;
    let batch = [];
    let chunkIdx = 0;
    let total = 0;

    // Sends the pending points to Qdrant and resets the batch.
    const flush = async (suffix = "") => {
      if (batch.length === 0) return;
      await qdrant.upsert(COLLECTION, { wait: true, points: batch });
      console.log(`[UPSERT] ${batch.length} vectors${suffix}`);
      total += batch.length;
      batch = [];
    };

    for (let pageNum = 1; pageNum <= pageCount; pageNum++) {
      console.log(`[PAGE] ${pageNum}/${pageCount}`);

      const page = await pdf.getPage(pageNum);
      const content = await page.getTextContent();
      const text = content.items.map((item) => item.str).join(" ");

      if (text.trim().length === 0) continue;

      for (const chunk of chunkText(text)) {
        // Embeddings are computed one at a time, in document order.
        const vector = await createEmbedding(chunk);

        batch.push({
          id: randomUUID(),
          vector,
          payload: {
            file: fileName,
            page: pageNum,
            chunk: chunkIdx,
            text: chunk,
            created_at: new Date().toISOString(),
          },
        });

        chunkIdx++;

        if (batch.length >= BATCH) await flush();
      }
    }

    await flush(" (final)");

    return { fileName, pages: pageCount, vectors: total };
  } finally {
    // Release the parsed document; a cleanup failure must not mask the result.
    await loadingTask.destroy().catch((err) => {
      console.error("[PDF] cleanup failed:", err);
    });
  }
}

/**
 * HTTP handler that ingests an uploaded PDF.
 *
 * Expects `req.file` to carry `buffer`, `originalname`, and `size`.
 * NOTE(review): this looks like an upload middleware populating `req.file`
 * from a form field named 'pdf' — confirm against the route setup.
 *
 * Responds with:
 * - 400 and `{ success: false, error }` when no file is attached;
 * - 200 and `{ success: true, ms, fileName, pages, vectors }` on success;
 * - 500 and `{ success: false, error }` when processing fails. The error is
 *   always a string, even if something other than an Error was thrown.
 *
 * @param {object} req - Request object with an optional `file` property.
 * @param {object} res - Response object exposing `status()` and `json()`.
 * @returns {Promise<*>} Whatever `res.json()` returns.
 */
const upload = async (req, res) => {
  if (!req.file) {
    return res.status(400).json({ success: false, error: "No file. Use field name 'pdf'." });
  }

  const { originalname, size, buffer } = req.file;
  const t0 = Date.now();
  console.log(`[UPLOAD] ${originalname} (${(size / 1024).toFixed(1)} KB)`);

  try {
    const result = await extractAndInsert(buffer, originalname);
    return res.json({ success: true, ms: Date.now() - t0, ...result });
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    console.error("[QDRANT INSERT ERROR]", message);
    return res.status(500).json({ success: false, error: message });
  }
};

// Public API of this module: the PDF upload request handler.
module.exports = { upload };