Files
gyanBuddy/controller/qdrantinsert.js
T
Gitea 41d82f9266
Deploy Node App / deploy (push) Successful in 25s
structure ready
2026-06-15 15:43:09 +05:30

142 lines
3.8 KiB
JavaScript

const { randomUUID } = require("node:crypto");

const { QdrantClient } = require("@qdrant/js-client-rest");
const { pipeline } = require("@xenova/transformers");
// Name of the Qdrant collection used by this module, read from the
// QDRANT_COLLECTION environment variable (undefined when the variable is unset).
const COLLECTION = process.env.QDRANT_COLLECTION;

// Single Qdrant REST client shared by every function in this module.
// NOTE(review): `timeout` is presumably in milliseconds and
// `checkCompatibility: false` presumably disables the client/server version
// check -- confirm against the installed @qdrant/js-client-rest version.
const qdrant = new QdrantClient({
  timeout: 30000,
  checkCompatibility: false,
  url: process.env.QDRANT_URL,
});
// Keep a handle on the original console.warn, then install a filtering
// wrapper: string warnings containing "UnknownErrorException" or "TT:" are
// dropped, everything else is forwarded unchanged.
// NOTE(review): these look like noisy PDF-parsing warnings -- confirm the source.
const _warn = console.warn;
console.warn = (msg, ...args) => {
  const isSuppressed =
    typeof msg === "string" &&
    ["UnknownErrorException", "TT:"].some((marker) => msg.includes(marker));
  if (!isSuppressed) {
    _warn(msg, ...args);
  }
};
// Module-level cache for the feature-extraction pipeline; null until first use.
let embedder = null;

/**
 * Returns the cached embedding pipeline, creating it on the first call via
 * `pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2")`.
 *
 * @returns {Promise<*>} Whatever `pipeline(...)` resolved to.
 */
async function getEmbedder() {
  if (embedder) {
    return embedder;
  }
  embedder = await pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2");
  return embedder;
}
/**
 * Embeds a piece of text with the shared pipeline, requesting mean pooling
 * and normalization.
 *
 * @param {string} text - Text to embed.
 * @returns {Promise<number[]>} Plain array copied from the model output's
 *   `data` field (presumably 384 values, matching the collection's vector
 *   size -- confirm against the model).
 */
async function createEmbedding(text) {
  const embed = await getEmbedder();
  const { data } = await embed(text, { pooling: "mean", normalize: true });
  return Array.from(data);
}
/**
 * Makes sure the configured collection exists before points are written.
 *
 * The collection is looked up first; if the lookup throws for any reason, the
 * collection is created with 384-dimensional cosine-distance vectors. Errors
 * from the creation call propagate to the caller.
 *
 * @returns {Promise<void>}
 */
async function ensureCollection() {
  let alreadyThere = true;
  try {
    await qdrant.getCollection(COLLECTION);
  } catch {
    // Any lookup failure is treated as "collection missing".
    alreadyThere = false;
  }
  if (alreadyThere) {
    return;
  }

  const vectors = { size: 384, distance: "Cosine" };
  await qdrant.createCollection(COLLECTION, { vectors });
  console.log(`[QDRANT] Collection '${COLLECTION}' created.`);
}
/**
 * Splits text into overlapping chunks, preferring to end a chunk right after
 * a period.
 *
 * Whitespace runs are collapsed to single spaces and the text is trimmed
 * before chunking. Each window spans up to `chunkSize` characters; when more
 * text follows, the window is shortened to end just after the last period
 * found at or before the window end, but only if doing so still lets the next
 * window start strictly after the current one. Otherwise the window is cut at
 * `chunkSize`. Chunks whose trimmed length is 50 characters or fewer are
 * dropped. Chunking stops as soon as a window reaches the end of the text, so
 * no trailing chunk that is fully contained in the previous one is emitted.
 *
 * @param {string} text - Text to split; null/undefined is treated as empty.
 * @param {number} [chunkSize=1200] - Maximum window size in characters.
 * @param {number} [overlap=250] - Characters shared between consecutive windows.
 * @returns {string[]} The chunks, in document order.
 * @throws {RangeError} If `chunkSize` is not positive or `overlap` is not
 *   smaller than `chunkSize` (either would prevent forward progress).
 */
function chunkText(text, chunkSize = 1200, overlap = 250) {
  if (!(chunkSize > 0) || !(overlap < chunkSize)) {
    throw new RangeError(
      "chunkText: chunkSize must be positive and overlap must be smaller than chunkSize",
    );
  }

  const MIN_CHUNK_LENGTH = 50;
  const normalized = String(text ?? "").replace(/\s+/g, " ").trim();
  const chunks = [];

  let start = 0;
  while (start < normalized.length) {
    let end = Math.min(start + chunkSize, normalized.length);

    if (end < normalized.length) {
      const lastPeriod = normalized.lastIndexOf(".", end);
      // Break at the sentence end only when the next window (end - overlap)
      // would still begin after `start`; a period too close to `start` would
      // otherwise pin the loop to the same window forever.
      if (lastPeriod > start && lastPeriod + 1 - overlap > start) {
        end = lastPeriod + 1;
      }
    }

    const chunk = normalized.slice(start, end).trim();
    if (chunk.length > MIN_CHUNK_LENGTH) {
      chunks.push(chunk);
    }

    // The window reached the end of the text: everything is covered.
    if (end >= normalized.length) {
      break;
    }
    start = end - overlap;
  }

  return chunks;
}
/**
 * Reads a PDF, embeds its text page by page and stores the vectors in the
 * configured Qdrant collection.
 *
 * For every page the text items are joined with spaces, split with
 * `chunkText`, and each chunk is embedded with `createEmbedding`. Points are
 * upserted in batches of 50 (plus a final partial batch). Each point's payload
 * carries the file name, 1-based page number, running chunk index, chunk text
 * and an ISO creation timestamp.
 *
 * Point ids are random UUIDs. The previous id, built by concatenating
 * `Date.now()` with the chunk index, exceeded Number.MAX_SAFE_INTEGER once the
 * index reached four digits, so distinct chunks could round to the same id.
 *
 * @param {*} buffer - PDF bytes; wrapped in a `Uint8Array` before parsing.
 * @param {string} fileName - Name recorded in each point's payload and in logs.
 * @returns {Promise<{fileName: string, pages: number, vectors: number}>}
 *   The file name, the page count and the number of vectors upserted.
 * @throws Propagates errors from PDF parsing, embedding and Qdrant calls.
 */
async function extractAndInsert(buffer, fileName) {
  const BATCH = 50;
  const { getDocument } = await import("pdfjs-dist/legacy/build/pdf.mjs");
  const pdf = await getDocument({ data: new Uint8Array(buffer) }).promise;

  try {
    const pageCount = pdf.numPages;
    console.log(`[PDF] ${fileName}: ${pageCount} pages`);
    await ensureCollection();

    let batch = [];
    let chunkIdx = 0;
    let total = 0;

    // Upserts the pending points, logs the batch and resets the buffer.
    const flush = async (suffix) => {
      await qdrant.upsert(COLLECTION, { wait: true, points: batch });
      console.log(`[UPSERT] ${batch.length} vectors${suffix}`);
      total += batch.length;
      batch = [];
    };

    for (let pageNum = 1; pageNum <= pageCount; pageNum++) {
      console.log(`[PAGE] ${pageNum}/${pageCount}`);
      const page = await pdf.getPage(pageNum);
      const content = await page.getTextContent();
      const text = content.items.map((item) => item.str).join(" ");
      if (!text || text.trim().length === 0) continue;

      for (const chunk of chunkText(text)) {
        const vector = await createEmbedding(chunk);
        batch.push({
          id: randomUUID(),
          vector,
          payload: {
            file: fileName,
            page: pageNum,
            chunk: chunkIdx,
            text: chunk,
            created_at: new Date().toISOString(),
          },
        });
        chunkIdx++;
        if (batch.length >= BATCH) {
          await flush("");
        }
      }
    }

    if (batch.length > 0) {
      await flush(" (final)");
    }

    return { fileName, pages: pageCount, vectors: total };
  } finally {
    // Release the parsed document whether or not insertion succeeded. Cleanup
    // is best-effort so that it never masks the real result or error.
    try {
      await pdf.destroy();
    } catch (cleanupErr) {
      console.warn("[PDF] cleanup failed:", cleanupErr.message);
    }
  }
}
/**
 * Request handler that ingests an uploaded PDF into Qdrant.
 *
 * Responds 400 when `req.file` is missing. Otherwise it passes the file's
 * buffer and original name to `extractAndInsert` and replies with
 * `{ success: true, ms, ...result }`, where `ms` is the elapsed time. Any
 * error is logged and answered with a 500 carrying the error message.
 *
 * @param {object} req - Request; `req.file` must expose `buffer`,
 *   `originalname` and `size`.
 * @param {object} res - Response exposing chainable `status()` and `json()`.
 * @returns {Promise<*>} Whatever the final `res.json(...)` call returns.
 */
const upload = async (req, res) => {
  const { file } = req;
  if (!file) {
    return res.status(400).json({ success: false, error: "No file. Use field name 'pdf'." });
  }

  const startedAt = Date.now();
  const sizeKb = (file.size / 1024).toFixed(1);
  console.log(`[UPLOAD] ${file.originalname} (${sizeKb} KB)`);

  try {
    const summary = await extractAndInsert(file.buffer, file.originalname);
    const ms = Date.now() - startedAt;
    return res.json({ success: true, ms, ...summary });
  } catch (err) {
    const { message } = err;
    console.error("[QDRANT INSERT ERROR]", message);
    return res.status(500).json({ success: false, error: message });
  }
};
// Public surface of this module: only the `upload` request handler is exported.
module.exports = { upload };