first commit

2026-06-05 10:25:09 +05:30
commit 0b5715f4b0
10 changed files with 4276 additions and 0 deletions
@@ -0,0 +1,723 @@
+require("dotenv").config();
+
+const fs = require("fs");
+const path = require("path");
+
+// ======================
+// SUPPRESS PDF WARNINGS
+// ======================
+const originalWarn = console.warn;
+
+console.warn = (
+  message,
+  ...args
+) => {
+  if (
+    typeof message ===
+    "string" &&
+    (
+      message.includes(
+        "UnknownErrorException"
+      ) ||
+      message.includes(
+        "TT:"
+      )
+    )
+  ) {
+    return;
+  }
+
+  originalWarn(
+    message,
+    ...args
+  );
+};
+
+// ======================
+// PDF.js
+// ======================
+const pdfjsLib = require(
+  "pdfjs-dist/legacy/build/pdf.mjs"
+);
+
+// ======================
+// Transformers
+// ======================
+const {
+  pipeline,
+} = require("@xenova/transformers");
+
+// ======================
+// Qdrant
+// ======================
+const {
+  QdrantClient,
+} = require("@qdrant/js-client-rest");
+
+// ======================
+// QDRANT CONFIG
+// ======================
+const qdrant = new QdrantClient({
+  url: "http://20.40.61.65:6333",
+  checkCompatibility: false,
+  timeout: 30000,
+});
+
+const COLLECTION_NAME =
+  "pdf_rag";
+
+let embedder;
+
+// ======================
+// LOAD MODEL
+// ======================
+async function loadModel() {
+  console.log(
+    "⏳ Loading embedding model..."
+  );
+
+  embedder = await pipeline(
+    "feature-extraction",
+    "Xenova/all-MiniLM-L6-v2"
+  );
+
+  console.log(
+    "✅ Embedding model loaded"
+  );
+}
+
+// ======================
+// SMART CHUNKING
+// ======================
+function chunkText(
+  text,
+  chunkSize = 800,
+  overlap = 150
+) {
+  const chunks = [];
+
+  text = text
+    .replace(/\s+/g, " ")
+    .trim();
+
+  let start = 0;
+
+  while (
+    start < text.length
+  ) {
+    let end =
+      start + chunkSize;
+
+    // Try sentence ending
+    if (
+      end < text.length
+    ) {
+      const lastPeriod =
+        text.lastIndexOf(
+          ".",
+          end
+        );
+
+      if (
+        lastPeriod >
+        start
+      ) {
+        end =
+          lastPeriod +
+          1;
+      }
+    }
+
+    const chunk = text
+      .slice(start, end)
+      .trim();
+
+    if (
+      chunk.length > 50
+    ) {
+      chunks.push(
+        chunk
+      );
+    }
+
+    start =
+      end - overlap;
+  }
+
+  return chunks;
+}
+
+// ======================
+// CREATE EMBEDDING
+// ======================
+async function createEmbedding(
+  text
+) {
+  const output =
+    await embedder(text, {
+      pooling: "mean",
+
+      normalize: true,
+    });
+
+  return Array.from(
+    output.data
+  );
+}
+
+// ======================
+// CREATE COLLECTION
+// ======================
+async function createCollection() {
+  try {
+    await qdrant.getCollection(
+      COLLECTION_NAME
+    );
+
+    console.log(
+      "ℹ️ Collection already exists"
+    );
+  } catch (err) {
+    console.log(
+      "⏳ Creating collection..."
+    );
+
+    await qdrant.createCollection(
+      COLLECTION_NAME,
+      {
+        vectors: {
+          size: 384,
+
+          distance:
+            "Cosine",
+        },
+      }
+    );
+
+    console.log(
+      "✅ Collection created"
+    );
+  }
+}
+
+// ======================
+// EXTRACT TEXT FROM PDF
+// ======================
+async function extractTextFromPDF(
+  filePath
+) {
+  try {
+    const dataBuffer =
+      fs.readFileSync(
+        filePath
+      );
+
+    const uint8Array =
+      new Uint8Array(
+        dataBuffer
+      );
+
+    const loadingTask =
+      pdfjsLib.getDocument(
+        {
+          data: uint8Array,
+        }
+      );
+
+    const pdf =
+      await loadingTask.promise;
+
+    let fullText = "";
+
+    console.log(
+      `📄 Pages: ${pdf.numPages}`
+    );
+
+    for (
+      let i = 1;
+      i <= pdf.numPages;
+      i++
+    ) {
+      const page =
+        await pdf.getPage(i);
+
+      const content =
+        await page.getTextContent();
+
+      const pageText =
+        content.items
+          .map(
+            (item) =>
+              item.str
+          )
+          .join(" ");
+
+      fullText +=
+        pageText + "\n";
+    }
+
+    return fullText;
+  } catch (error) {
+    console.log(
+      "❌ PDF extraction error:",
+      error
+    );
+
+    return "";
+  }
+}
+
+// ======================
+// PROCESS PDF
+// ======================
+async function processPDF(filePath, fileName) {
+  try {
+    const dataBuffer = fs.readFileSync(filePath);
+
+    const pdf = await pdfjsLib
+      .getDocument({
+        data: new Uint8Array(dataBuffer),
+      })
+      .promise;
+
+    console.log(
+      `📄 ${fileName} - ${pdf.numPages} pages`
+    );
+
+    const batchSize = 50;
+    let batchPoints = [];
+    let globalChunkIndex = 0;
+
+    for (
+      let pageNum = 1;
+      pageNum <= pdf.numPages;
+      pageNum++
+    ) {
+
+  
+      
+      console.log(
+        `📖 Processing page ${pageNum}/${pdf.numPages}`
+      );
+
+      const page =
+        await pdf.getPage(pageNum);
+
+      const content =
+        await page.getTextContent();
+
+      const pageText =
+        content.items
+          .map((item) => item.str)
+          .join(" ");
+
+      if (
+        !pageText ||
+        pageText.trim().length === 0
+      ) {
+        continue;
+      }
+
+      const chunks = chunkText(
+        pageText,
+        1200,
+        250
+      );
+
+      for (const chunk of chunks) {
+        const embedding =
+          await createEmbedding(chunk);
+
+        batchPoints.push({
+          id: Number(
+            `${Date.now()}${globalChunkIndex}`
+          ),
+
+          vector: embedding,
+
+          payload: {
+            file: fileName,
+            page: pageNum,
+            chunk: globalChunkIndex,
+            text: chunk,
+            created_at:
+              new Date().toISOString(),
+          },
+        });
+
+        globalChunkIndex++;
+
+        if (
+          batchPoints.length >= batchSize
+        ) {
+          console.log(
+            `⬆️ Uploading ${batchPoints.length} vectors`
+          );
+
+          await qdrant.upsert(
+            COLLECTION_NAME,
+            {
+              wait: true,
+              points: batchPoints,
+            }
+          );
+
+          batchPoints = [];
+        }
+      }
+    }
+
+    if (batchPoints.length > 0) {
+      console.log(
+        `⬆️ Uploading final ${batchPoints.length} vectors`
+      );
+
+      await qdrant.upsert(
+        COLLECTION_NAME,
+        {
+          wait: true,
+          points: batchPoints,
+        }
+      );
+    }
+
+    console.log(
+      `✅ ${fileName} indexed successfully`
+    );
+  } catch (error) {
+    console.log(
+      `❌ Error processing ${fileName}:`,
+      error
+    );
+  }
+}
+
+// ======================
+// MAIN
+// ======================
+async function main() {
+  try {
+    await loadModel();
+
+    await createCollection();
+
+    const folder =
+      path.join(
+        __dirname,
+        "uploads"
+      );
+
+    if (
+      !fs.existsSync(
+        folder
+      )
+    ) {
+      console.log(
+        "❌ uploads folder not found"
+      );
+
+      return;
+    }
+
+    const files =
+      fs
+        .readdirSync(
+          folder
+        )
+        .filter((file) =>
+          file.endsWith(
+            ".pdf"
+          )
+        );
+
+    if (
+      files.length === 0
+    ) {
+      console.log(
+        "⚠️ No PDFs found"
+      );
+
+      return;
+    }
+
+    console.log(
+      `📚 Found ${files.length} PDFs`
+    );
+
+    for (const file of files) {
+      const filePath =
+        path.join(
+          folder,
+          file
+        );
+
+      console.log(
+        `\n📄 Processing ${file}`
+      );
+
+      await processPDF(
+        filePath,
+        file
+      );
+    }
+
+    console.log(
+      "\n🎉 All PDFs indexed successfully"
+    );
+  } catch (err) {
+    console.error(
+      "❌ MAIN ERROR:",
+      err
+    );
+  }
+}
+
+main();
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+// require("dotenv").config();
+// const fs = require("fs");
+// const path = require("path");
+// const crypto = require("crypto");
+
+// // ─── Suppress PDF warnings ────────────────────────────────────────────────────
+// const _warn = console.warn;
+// console.warn = (msg, ...a) => {
+//   if (typeof msg === "string" && (msg.includes("UnknownErrorException") || msg.includes("TT:"))) return;
+//   _warn(msg, ...a);
+// };
+
+// const pdfjsLib = require("pdfjs-dist/legacy/build/pdf.mjs");
+// const { pipeline } = require("@xenova/transformers");
+// const { QdrantClient } = require("@qdrant/js-client-rest");
+
+// // ─── Config ───────────────────────────────────────────────────────────────────
+// const QDRANT_URL       = process.env.QDRANT_URL || "http://20.40.61.65:6333";
+// const COLLECTION_NAME  = "pdf_rag";
+// const VECTOR_SIZE      = 384;
+// const CHUNK_SIZE       = 1200;
+// const CHUNK_OVERLAP    = 250;
+// const BATCH_SIZE       = 100;       // points per upsert call
+// const EMBED_CONCURRENCY = 8;        // parallel embeddings at once
+// const MAX_RETRIES      = 3;
+
+// const qdrant = new QdrantClient({ url: QDRANT_URL, checkCompatibility: false, timeout: 60000 });
+// let embedder;
+
+// // ─── Semaphore ─────────────────────────────────────────────────────────────────
+// class Semaphore {
+//   constructor(n) { this.n = n; this.queue = []; }
+//   acquire() {
+//     return new Promise(res => {
+//       if (this.n > 0) { this.n--; res(); }
+//       else this.queue.push(res);
+//     });
+//   }
+//   release() {
+//     if (this.queue.length) this.queue.shift()();
+//     else this.n++;
+//   }
+// }
+
+// // ─── Retry helper ─────────────────────────────────────────────────────────────
+// async function withRetry(fn, retries = MAX_RETRIES, delay = 500) {
+//   for (let i = 0; i <= retries; i++) {
+//     try { return await fn(); }
+//     catch (err) {
+//       if (i === retries) throw err;
+//       console.warn(`  ⚠️  Retry ${i + 1}/${retries} after error: ${err.message}`);
+//       await new Promise(r => setTimeout(r, delay * 2 ** i));
+//     }
+//   }
+// }
+
+// // ─── Deterministic UUID from content hash ────────────────────────────────────
+// // Prevents duplicates if you re-run indexing on the same file
+// function makePointId(fileName, page, chunkIndex) {
+//   const hash = crypto
+//     .createHash("sha256")
+//     .update(`${fileName}::${page}::${chunkIndex}`)
+//     .digest("hex");
+//   // Qdrant supports UUID strings or unsigned ints; use hex slice as UUID-like string
+//   return `${hash.slice(0,8)}-${hash.slice(8,12)}-${hash.slice(12,16)}-${hash.slice(16,20)}-${hash.slice(20,32)}`;
+// }
+
+// // ─── Chunking ─────────────────────────────────────────────────────────────────
+// function chunkText(text, size = CHUNK_SIZE, overlap = CHUNK_OVERLAP) {
+//   const chunks = [];
+//   text = text.replace(/\s+/g, " ").trim();
+//   let start = 0;
+//   while (start < text.length) {
+//     let end = start + size;
+//     if (end < text.length) {
+//       const last = text.lastIndexOf(".", end);
+//       if (last > start) end = last + 1;
+//     }
+//     const chunk = text.slice(start, end).trim();
+//     if (chunk.length > 50) chunks.push(chunk);
+//     start = end - overlap;
+//   }
+//   return chunks;
+// }
+
+// // ─── Embedding ────────────────────────────────────────────────────────────────
+// async function embed(text) {
+//   const out = await embedder(text, { pooling: "mean", normalize: true });
+//   return Array.from(out.data);
+// }
+
+// // Embed multiple texts with bounded parallelism
+// async function embedBatch(texts) {
+//   const sem = new Semaphore(EMBED_CONCURRENCY);
+//   return Promise.all(
+//     texts.map(async (text) => {
+//       await sem.acquire();
+//       try { return await embed(text); }
+//       finally { sem.release(); }
+//     })
+//   );
+// }
+
+// // ─── Qdrant helpers ───────────────────────────────────────────────────────────
+// async function ensureCollection() {
+//   try {
+//     await qdrant.getCollection(COLLECTION_NAME);
+//     console.log("ℹ️  Collection already exists");
+//   } catch {
+//     console.log("⏳ Creating collection...");
+//     await qdrant.createCollection(COLLECTION_NAME, {
+//       vectors: { size: VECTOR_SIZE, distance: "Cosine" },
+//       // Optimizers: tune for bulk ingest speed, re-enable indexing after
+//       optimizers_config: { indexing_threshold: 0 },
+//     });
+//     console.log("✅ Collection created");
+//   }
+// }
+
+// // Upload a batch with retry
+// async function upsertBatch(points) {
+//   await withRetry(() =>
+//     qdrant.upsert(COLLECTION_NAME, { wait: true, points })
+//   );
+// }
+
+// // After bulk ingest, re-enable HNSW indexing
+// async function enableIndexing() {
+//   await qdrant.updateCollection(COLLECTION_NAME, {
+//     optimizers_config: { indexing_threshold: 20000 },
+//   });
+//   console.log("🔧 HNSW indexing re-enabled");
+// }
+
+// // ─── Check if file already indexed ───────────────────────────────────────────
+// async function isFileIndexed(fileName) {
+//   try {
+//     const result = await qdrant.scroll(COLLECTION_NAME, {
+//       filter: { must: [{ key: "file", match: { value: fileName } }] },
+//       limit: 1,
+//       with_payload: false,
+//       with_vector: false,
+//     });
+//     return result.points.length > 0;
+//   } catch { return false; }
+// }
+
+// // ─── Process a single PDF ─────────────────────────────────────────────────────
+// async function processPDF(filePath, fileName) {
+//   console.log(`\n📄 ${fileName}`);
+
+//   if (await isFileIndexed(fileName)) {
+//     console.log(`  ⏭️  Already indexed — skipping`);
+//     return;
+//   }
+
+//   const pdf = await pdfjsLib
+//     .getDocument({ data: new Uint8Array(fs.readFileSync(filePath)) })
+//     .promise;
+
+//   console.log(`  📖 ${pdf.numPages} pages`);
+
+//   const allChunks = []; // { text, page, chunkIndex }
+
+//   // 1️⃣  Extract all text first (fast, sequential is fine for I/O)
+//   for (let p = 1; p <= pdf.numPages; p++) {
+//     const page    = await pdf.getPage(p);
+//     const content = await page.getTextContent();
+//     const text    = content.items.map(i => i.str).join(" ");
+//     if (!text.trim()) continue;
+//     const chunks = chunkText(text);
+//     chunks.forEach((chunk, ci) => allChunks.push({ text: chunk, page: p, chunkIndex: allChunks.length }));
+//   }
+
+//   console.log(`  🧩 ${allChunks.length} chunks — embedding with concurrency=${EMBED_CONCURRENCY}`);
+
+//   // 2️⃣  Embed all chunks in parallel (bounded by semaphore)
+//   const start = Date.now();
+//   const vectors = await embedBatch(allChunks.map(c => c.text));
+//   const elapsed = ((Date.now() - start) / 1000).toFixed(1);
+//   console.log(`  ⚡ Embedding done in ${elapsed}s`);
+
+//   // 3️⃣  Build points
+//   const points = allChunks.map((c, i) => ({
+//     id:      makePointId(fileName, c.page, c.chunkIndex),
+//     vector:  vectors[i],
+//     payload: {
+//       file:       fileName,
+//       page:       c.page,
+//       chunk:      c.chunkIndex,
+//       text:       c.text,
+//       created_at: new Date().toISOString(),
+//     },
+//   }));
+
+//   // 4️⃣  Batch upsert with progress
+//   let uploaded = 0;
+//   for (let i = 0; i < points.length; i += BATCH_SIZE) {
+//     const batch = points.slice(i, i + BATCH_SIZE);
+//     await upsertBatch(batch);
+//     uploaded += batch.length;
+//     process.stdout.write(`\r  ⬆️  ${uploaded}/${points.length} vectors uploaded`);
+//   }
+//   console.log(`\n  ✅ ${fileName} indexed`);
+// }
+
+// // ─── Main ─────────────────────────────────────────────────────────────────────
+// async function main() {
+//   console.log("⏳ Loading embedding model...");
+//   embedder = await pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2");
+//   console.log("✅ Model loaded\n");
+
+//   await ensureCollection();
+
+//   const folder = path.join(__dirname, "uploads");
+//   if (!fs.existsSync(folder)) return console.log("❌ uploads/ folder not found");
+
+//   const pdfs = fs.readdirSync(folder).filter(f => f.endsWith(".pdf"));
+//   if (!pdfs.length) return console.log("⚠️  No PDFs found");
+
+//   console.log(`📚 Found ${pdfs.length} PDF(s)\n`);
+
+//   const t0 = Date.now();
+//   for (const file of pdfs) {
+//     await processPDF(path.join(folder, file), file);
+//   }
+
+//   await enableIndexing(); // re-enable HNSW after bulk load
+
+//   console.log(`\n🎉 Done in ${((Date.now() - t0) / 1000).toFixed(1)}s`);
+// }
+
+// main().catch(err => { console.error("❌ Fatal:", err); process.exit(1); });