Files
gyanBuddy/index.js
T
2026-06-05 10:25:09 +05:30

723 lines
17 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
require("dotenv").config();
const fs = require("fs");
const path = require("path");
// ======================
// SUPPRESS PDF WARNINGS
// ======================
const originalWarn = console.warn;
console.warn = (
message,
...args
) => {
if (
typeof message ===
"string" &&
(
message.includes(
"UnknownErrorException"
) ||
message.includes(
"TT:"
)
)
) {
return;
}
originalWarn(
message,
...args
);
};
// ======================
// PDF.js
// ======================
const pdfjsLib = require(
"pdfjs-dist/legacy/build/pdf.mjs"
);
// ======================
// Transformers
// ======================
const {
pipeline,
} = require("@xenova/transformers");
// ======================
// Qdrant
// ======================
const {
QdrantClient,
} = require("@qdrant/js-client-rest");
// ======================
// QDRANT CONFIG
// ======================
const qdrant = new QdrantClient({
url: "http://20.40.61.65:6333",
checkCompatibility: false,
timeout: 30000,
});
const COLLECTION_NAME =
"pdf_rag";
let embedder;
// ======================
// LOAD MODEL
// ======================
async function loadModel() {
console.log(
"⏳ Loading embedding model..."
);
embedder = await pipeline(
"feature-extraction",
"Xenova/all-MiniLM-L6-v2"
);
console.log(
"✅ Embedding model loaded"
);
}
// ======================
// SMART CHUNKING
// ======================
function chunkText(
text,
chunkSize = 800,
overlap = 150
) {
const chunks = [];
text = text
.replace(/\s+/g, " ")
.trim();
let start = 0;
while (
start < text.length
) {
let end =
start + chunkSize;
// Try sentence ending
if (
end < text.length
) {
const lastPeriod =
text.lastIndexOf(
".",
end
);
if (
lastPeriod >
start
) {
end =
lastPeriod +
1;
}
}
const chunk = text
.slice(start, end)
.trim();
if (
chunk.length > 50
) {
chunks.push(
chunk
);
}
start =
end - overlap;
}
return chunks;
}
// ======================
// CREATE EMBEDDING
// ======================
async function createEmbedding(
text
) {
const output =
await embedder(text, {
pooling: "mean",
normalize: true,
});
return Array.from(
output.data
);
}
// ======================
// CREATE COLLECTION
// ======================
async function createCollection() {
try {
await qdrant.getCollection(
COLLECTION_NAME
);
console.log(
"️ Collection already exists"
);
} catch (err) {
console.log(
"⏳ Creating collection..."
);
await qdrant.createCollection(
COLLECTION_NAME,
{
vectors: {
size: 384,
distance:
"Cosine",
},
}
);
console.log(
"✅ Collection created"
);
}
}
// ======================
// EXTRACT TEXT FROM PDF
// ======================
async function extractTextFromPDF(
filePath
) {
try {
const dataBuffer =
fs.readFileSync(
filePath
);
const uint8Array =
new Uint8Array(
dataBuffer
);
const loadingTask =
pdfjsLib.getDocument(
{
data: uint8Array,
}
);
const pdf =
await loadingTask.promise;
let fullText = "";
console.log(
`📄 Pages: ${pdf.numPages}`
);
for (
let i = 1;
i <= pdf.numPages;
i++
) {
const page =
await pdf.getPage(i);
const content =
await page.getTextContent();
const pageText =
content.items
.map(
(item) =>
item.str
)
.join(" ");
fullText +=
pageText + "\n";
}
return fullText;
} catch (error) {
console.log(
"❌ PDF extraction error:",
error
);
return "";
}
}
// ======================
// PROCESS PDF
// ======================
async function processPDF(filePath, fileName) {
try {
const dataBuffer = fs.readFileSync(filePath);
const pdf = await pdfjsLib
.getDocument({
data: new Uint8Array(dataBuffer),
})
.promise;
console.log(
`📄 ${fileName} - ${pdf.numPages} pages`
);
const batchSize = 50;
let batchPoints = [];
let globalChunkIndex = 0;
for (
let pageNum = 1;
pageNum <= pdf.numPages;
pageNum++
) {
console.log(
`📖 Processing page ${pageNum}/${pdf.numPages}`
);
const page =
await pdf.getPage(pageNum);
const content =
await page.getTextContent();
const pageText =
content.items
.map((item) => item.str)
.join(" ");
if (
!pageText ||
pageText.trim().length === 0
) {
continue;
}
const chunks = chunkText(
pageText,
1200,
250
);
for (const chunk of chunks) {
const embedding =
await createEmbedding(chunk);
batchPoints.push({
id: Number(
`${Date.now()}${globalChunkIndex}`
),
vector: embedding,
payload: {
file: fileName,
page: pageNum,
chunk: globalChunkIndex,
text: chunk,
created_at:
new Date().toISOString(),
},
});
globalChunkIndex++;
if (
batchPoints.length >= batchSize
) {
console.log(
`⬆️ Uploading ${batchPoints.length} vectors`
);
await qdrant.upsert(
COLLECTION_NAME,
{
wait: true,
points: batchPoints,
}
);
batchPoints = [];
}
}
}
if (batchPoints.length > 0) {
console.log(
`⬆️ Uploading final ${batchPoints.length} vectors`
);
await qdrant.upsert(
COLLECTION_NAME,
{
wait: true,
points: batchPoints,
}
);
}
console.log(
`✅ ${fileName} indexed successfully`
);
} catch (error) {
console.log(
`❌ Error processing ${fileName}:`,
error
);
}
}
// ======================
// MAIN
// ======================
async function main() {
try {
await loadModel();
await createCollection();
const folder =
path.join(
__dirname,
"uploads"
);
if (
!fs.existsSync(
folder
)
) {
console.log(
"❌ uploads folder not found"
);
return;
}
const files =
fs
.readdirSync(
folder
)
.filter((file) =>
file.endsWith(
".pdf"
)
);
if (
files.length === 0
) {
console.log(
"⚠️ No PDFs found"
);
return;
}
console.log(
`📚 Found ${files.length} PDFs`
);
for (const file of files) {
const filePath =
path.join(
folder,
file
);
console.log(
`\n📄 Processing ${file}`
);
await processPDF(
filePath,
file
);
}
console.log(
"\n🎉 All PDFs indexed successfully"
);
} catch (err) {
console.error(
"❌ MAIN ERROR:",
err
);
}
}
main();
// require("dotenv").config();
// const fs = require("fs");
// const path = require("path");
// const crypto = require("crypto");
// // ─── Suppress PDF warnings ────────────────────────────────────────────────────
// const _warn = console.warn;
// console.warn = (msg, ...a) => {
// if (typeof msg === "string" && (msg.includes("UnknownErrorException") || msg.includes("TT:"))) return;
// _warn(msg, ...a);
// };
// const pdfjsLib = require("pdfjs-dist/legacy/build/pdf.mjs");
// const { pipeline } = require("@xenova/transformers");
// const { QdrantClient } = require("@qdrant/js-client-rest");
// // ─── Config ───────────────────────────────────────────────────────────────────
// const QDRANT_URL = process.env.QDRANT_URL || "http://20.40.61.65:6333";
// const COLLECTION_NAME = "pdf_rag";
// const VECTOR_SIZE = 384;
// const CHUNK_SIZE = 1200;
// const CHUNK_OVERLAP = 250;
// const BATCH_SIZE = 100; // points per upsert call
// const EMBED_CONCURRENCY = 8; // parallel embeddings at once
// const MAX_RETRIES = 3;
// const qdrant = new QdrantClient({ url: QDRANT_URL, checkCompatibility: false, timeout: 60000 });
// let embedder;
// // ─── Semaphore ─────────────────────────────────────────────────────────────────
// class Semaphore {
// constructor(n) { this.n = n; this.queue = []; }
// acquire() {
// return new Promise(res => {
// if (this.n > 0) { this.n--; res(); }
// else this.queue.push(res);
// });
// }
// release() {
// if (this.queue.length) this.queue.shift()();
// else this.n++;
// }
// }
// // ─── Retry helper ─────────────────────────────────────────────────────────────
// async function withRetry(fn, retries = MAX_RETRIES, delay = 500) {
// for (let i = 0; i <= retries; i++) {
// try { return await fn(); }
// catch (err) {
// if (i === retries) throw err;
// console.warn(` ⚠️ Retry ${i + 1}/${retries} after error: ${err.message}`);
// await new Promise(r => setTimeout(r, delay * 2 ** i));
// }
// }
// }
// // ─── Deterministic UUID from content hash ────────────────────────────────────
// // Prevents duplicates if you re-run indexing on the same file
// function makePointId(fileName, page, chunkIndex) {
// const hash = crypto
// .createHash("sha256")
// .update(`${fileName}::${page}::${chunkIndex}`)
// .digest("hex");
// // Qdrant supports UUID strings or unsigned ints; use hex slice as UUID-like string
// return `${hash.slice(0,8)}-${hash.slice(8,12)}-${hash.slice(12,16)}-${hash.slice(16,20)}-${hash.slice(20,32)}`;
// }
// // ─── Chunking ─────────────────────────────────────────────────────────────────
// function chunkText(text, size = CHUNK_SIZE, overlap = CHUNK_OVERLAP) {
// const chunks = [];
// text = text.replace(/\s+/g, " ").trim();
// let start = 0;
// while (start < text.length) {
// let end = start + size;
// if (end < text.length) {
// const last = text.lastIndexOf(".", end);
// if (last > start) end = last + 1;
// }
// const chunk = text.slice(start, end).trim();
// if (chunk.length > 50) chunks.push(chunk);
// start = end - overlap;
// }
// return chunks;
// }
// // ─── Embedding ────────────────────────────────────────────────────────────────
// async function embed(text) {
// const out = await embedder(text, { pooling: "mean", normalize: true });
// return Array.from(out.data);
// }
// // Embed multiple texts with bounded parallelism
// async function embedBatch(texts) {
// const sem = new Semaphore(EMBED_CONCURRENCY);
// return Promise.all(
// texts.map(async (text) => {
// await sem.acquire();
// try { return await embed(text); }
// finally { sem.release(); }
// })
// );
// }
// // ─── Qdrant helpers ───────────────────────────────────────────────────────────
// async function ensureCollection() {
// try {
// await qdrant.getCollection(COLLECTION_NAME);
// console.log("️ Collection already exists");
// } catch {
// console.log("⏳ Creating collection...");
// await qdrant.createCollection(COLLECTION_NAME, {
// vectors: { size: VECTOR_SIZE, distance: "Cosine" },
// // Optimizers: tune for bulk ingest speed, re-enable indexing after
// optimizers_config: { indexing_threshold: 0 },
// });
// console.log("✅ Collection created");
// }
// }
// // Upload a batch with retry
// async function upsertBatch(points) {
// await withRetry(() =>
// qdrant.upsert(COLLECTION_NAME, { wait: true, points })
// );
// }
// // After bulk ingest, re-enable HNSW indexing
// async function enableIndexing() {
// await qdrant.updateCollection(COLLECTION_NAME, {
// optimizers_config: { indexing_threshold: 20000 },
// });
// console.log("🔧 HNSW indexing re-enabled");
// }
// // ─── Check if file already indexed ───────────────────────────────────────────
// async function isFileIndexed(fileName) {
// try {
// const result = await qdrant.scroll(COLLECTION_NAME, {
// filter: { must: [{ key: "file", match: { value: fileName } }] },
// limit: 1,
// with_payload: false,
// with_vector: false,
// });
// return result.points.length > 0;
// } catch { return false; }
// }
// // ─── Process a single PDF ─────────────────────────────────────────────────────
// async function processPDF(filePath, fileName) {
// console.log(`\n📄 ${fileName}`);
// if (await isFileIndexed(fileName)) {
// console.log(` ⏭️ Already indexed — skipping`);
// return;
// }
// const pdf = await pdfjsLib
// .getDocument({ data: new Uint8Array(fs.readFileSync(filePath)) })
// .promise;
// console.log(` 📖 ${pdf.numPages} pages`);
// const allChunks = []; // { text, page, chunkIndex }
// // 1️⃣ Extract all text first (fast, sequential is fine for I/O)
// for (let p = 1; p <= pdf.numPages; p++) {
// const page = await pdf.getPage(p);
// const content = await page.getTextContent();
// const text = content.items.map(i => i.str).join(" ");
// if (!text.trim()) continue;
// const chunks = chunkText(text);
// chunks.forEach((chunk, ci) => allChunks.push({ text: chunk, page: p, chunkIndex: allChunks.length }));
// }
// console.log(` 🧩 ${allChunks.length} chunks — embedding with concurrency=${EMBED_CONCURRENCY}`);
// // 2️⃣ Embed all chunks in parallel (bounded by semaphore)
// const start = Date.now();
// const vectors = await embedBatch(allChunks.map(c => c.text));
// const elapsed = ((Date.now() - start) / 1000).toFixed(1);
// console.log(` ⚡ Embedding done in ${elapsed}s`);
// // 3️⃣ Build points
// const points = allChunks.map((c, i) => ({
// id: makePointId(fileName, c.page, c.chunkIndex),
// vector: vectors[i],
// payload: {
// file: fileName,
// page: c.page,
// chunk: c.chunkIndex,
// text: c.text,
// created_at: new Date().toISOString(),
// },
// }));
// // 4️⃣ Batch upsert with progress
// let uploaded = 0;
// for (let i = 0; i < points.length; i += BATCH_SIZE) {
// const batch = points.slice(i, i + BATCH_SIZE);
// await upsertBatch(batch);
// uploaded += batch.length;
// process.stdout.write(`\r ⬆️ ${uploaded}/${points.length} vectors uploaded`);
// }
// console.log(`\n ✅ ${fileName} indexed`);
// }
// // ─── Main ─────────────────────────────────────────────────────────────────────
// async function main() {
// console.log("⏳ Loading embedding model...");
// embedder = await pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2");
// console.log("✅ Model loaded\n");
// await ensureCollection();
// const folder = path.join(__dirname, "uploads");
// if (!fs.existsSync(folder)) return console.log("❌ uploads/ folder not found");
// const pdfs = fs.readdirSync(folder).filter(f => f.endsWith(".pdf"));
// if (!pdfs.length) return console.log("⚠️ No PDFs found");
// console.log(`📚 Found ${pdfs.length} PDF(s)\n`);
// const t0 = Date.now();
// for (const file of pdfs) {
// await processPDF(path.join(folder, file), file);
// }
// await enableIndexing(); // re-enable HNSW after bulk load
// console.log(`\n🎉 Done in ${((Date.now() - t0) / 1000).toFixed(1)}s`);
// }
// main().catch(err => { console.error("❌ Fatal:", err); process.exit(1); });