723 lines
17 KiB
JavaScript
723 lines
17 KiB
JavaScript
require("dotenv").config();

const crypto = require("crypto");
const fs = require("fs");
const path = require("path");
|
||
|
||
// ======================
|
||
// SUPPRESS PDF WARNINGS
|
||
// ======================
|
||
// Handle on the real console.warn so that non-suppressed messages still get through.
const originalWarn = console.warn;

/**
 * Replacement for console.warn that silences selected noisy warnings.
 *
 * A call is dropped when its first argument is a string containing either
 * "UnknownErrorException" or "TT:" (per the section banner these are PDF
 * warnings — presumably emitted by pdf.js; confirm). Every other call is
 * forwarded to the original console.warn with all arguments intact.
 */
console.warn = function filteredWarn(message, ...args) {
  const isSuppressed =
    typeof message === "string" &&
    ["UnknownErrorException", "TT:"].some((marker) =>
      message.includes(marker)
    );

  if (!isSuppressed) {
    originalWarn(message, ...args);
  }
};
|
||
|
||
// ======================
|
||
// PDF.js
|
||
// ======================
|
||
const pdfjsLib = require(
|
||
"pdfjs-dist/legacy/build/pdf.mjs"
|
||
);
|
||
|
||
// ======================
|
||
// Transformers
|
||
// ======================
|
||
const {
|
||
pipeline,
|
||
} = require("@xenova/transformers");
|
||
|
||
// ======================
|
||
// Qdrant
|
||
// ======================
|
||
const {
|
||
QdrantClient,
|
||
} = require("@qdrant/js-client-rest");
|
||
|
||
// ======================
|
||
// QDRANT CONFIG
|
||
// ======================
|
||
// Shared Qdrant client.
// The endpoint can be overridden with the QDRANT_URL environment variable
// (dotenv is loaded at the top of this file); when it is unset or empty the
// previous hard-coded address is used, so existing deployments keep working.
const qdrant = new QdrantClient({
  url: process.env.QDRANT_URL || "http://20.40.61.65:6333",
  // NOTE(review): presumably disables the client/server version check — confirm.
  checkCompatibility: false,
  // NOTE(review): presumably a request timeout in milliseconds — confirm.
  timeout: 30000,
});

// Name of the collection that getCollection / createCollection / upsert operate on.
const COLLECTION_NAME = "pdf_rag";

// Embedding pipeline; assigned by loadModel() and called by createEmbedding().
let embedder;
|
||
|
||
// ======================
|
||
// LOAD MODEL
|
||
// ======================
|
||
/**
 * Creates the feature-extraction pipeline and stores it in the module-level
 * `embedder` variable, logging before and after the load.
 *
 * @returns {Promise<void>} Settles once `pipeline(...)` has resolved and
 *   `embedder` has been assigned; rejects if `pipeline` rejects.
 */
async function loadModel() {
  const task = "feature-extraction";
  const modelId = "Xenova/all-MiniLM-L6-v2";

  console.log("⏳ Loading embedding model...");
  embedder = await pipeline(task, modelId);
  console.log("✅ Embedding model loaded");
}
|
||
|
||
// ======================
|
||
// SMART CHUNKING
|
||
// ======================
|
||
/**
 * Splits text into overlapping chunks, preferring to end a chunk on a period.
 *
 * Whitespace runs are collapsed to single spaces and the text is trimmed
 * before chunking. Each window spans up to `chunkSize` characters; when the
 * window does not reach the end of the text, it is shortened to end just
 * after the last "." found at or before the window end — but only if the next
 * window (which starts `overlap` characters before that end) would still
 * begin after the current one. Chunks of 50 characters or fewer (after
 * trimming) are discarded.
 *
 * Fixes over the previous implementation:
 *  - a period closer than `overlap` characters to the window start made the
 *    next start equal to or smaller than the current one (even negative),
 *    so the loop never terminated;
 *  - `overlap >= chunkSize` never advanced either; it is now rejected;
 *  - once a chunk reached the end of the text, an extra chunk consisting only
 *    of already-emitted tail text could be produced; chunking now stops there;
 *  - `null` / `undefined` input is treated as empty text.
 *
 * @param {string} text - Text to split.
 * @param {number} [chunkSize=800] - Maximum window length in characters (> 0).
 * @param {number} [overlap=150] - Characters shared between consecutive
 *   windows (0 <= overlap < chunkSize).
 * @returns {string[]} Chunks in document order.
 * @throws {RangeError} If `chunkSize` or `overlap` is outside the ranges above.
 */
function chunkText(text, chunkSize = 800, overlap = 150) {
  // Written as negated comparisons so NaN is rejected as well.
  if (!(chunkSize > 0)) {
    throw new RangeError("chunkSize must be a positive number");
  }
  if (!(overlap >= 0 && overlap < chunkSize)) {
    throw new RangeError("overlap must be >= 0 and smaller than chunkSize");
  }

  const normalized = String(text ?? "")
    .replace(/\s+/g, " ")
    .trim();

  const chunks = [];
  let start = 0;

  while (start < normalized.length) {
    let end = Math.min(start + chunkSize, normalized.length);

    if (end < normalized.length) {
      const lastPeriod = normalized.lastIndexOf(".", end);

      // Snap to the sentence end only when the next window start
      // (lastPeriod + 1 - overlap) still moves forward; otherwise keep the
      // full-size window so the loop is guaranteed to make progress.
      if (lastPeriod + 1 - overlap > start) {
        end = lastPeriod + 1;
      }
    }

    const chunk = normalized.slice(start, end).trim();

    if (chunk.length > 50) {
      chunks.push(chunk);
    }

    // The whole text has been covered; a further window would only repeat
    // the tail of the chunk just emitted.
    if (end >= normalized.length) {
      break;
    }

    start = end - overlap;
  }

  return chunks;
}
|
||
|
||
// ======================
|
||
// CREATE EMBEDDING
|
||
// ======================
|
||
/**
 * Embeds a piece of text with the module-level `embedder`.
 *
 * The embedder is called with mean pooling and normalization enabled, and the
 * `data` field of its result is copied into a plain array.
 *
 * @param {string} text - Text to embed.
 * @returns {Promise<number[]>} The embedder's `data` as a plain array.
 */
async function createEmbedding(text) {
  const options = { pooling: "mean", normalize: true };
  const { data } = await embedder(text, options);

  return Array.from(data);
}
|
||
|
||
// ======================
|
||
// CREATE COLLECTION
|
||
// ======================
|
||
/**
 * Makes sure the `COLLECTION_NAME` collection exists in Qdrant.
 *
 * Looks the collection up first; if the lookup fails, a collection with
 * 384-dimensional vectors and Cosine distance is created.
 *
 * NOTE(review): any error from `getCollection` (not only "not found") is
 * treated as "collection is missing".
 *
 * @returns {Promise<void>} Rejects only if `createCollection` itself fails.
 */
async function createCollection() {
  let exists = true;

  try {
    await qdrant.getCollection(COLLECTION_NAME);
  } catch (err) {
    exists = false;
  }

  if (exists) {
    console.log("ℹ️ Collection already exists");
    return;
  }

  console.log("⏳ Creating collection...");

  const vectors = { size: 384, distance: "Cosine" };
  await qdrant.createCollection(COLLECTION_NAME, { vectors });

  console.log("✅ Collection created");
}
|
||
|
||
// ======================
|
||
// EXTRACT TEXT FROM PDF
|
||
// ======================
|
||
/**
 * Reads a PDF from disk and returns the text of all pages.
 *
 * For each page the `str` fields of its text-content items are joined with a
 * single space, and every page's text is followed by "\n".
 *
 * Extraction is best-effort: any error is logged and an empty string is
 * returned instead of rejecting. The pdf.js loading task is destroyed when
 * the function finishes (on success and on failure), since it was previously
 * never released.
 *
 * @param {string} filePath - Path of the PDF file to read.
 * @returns {Promise<string>} Concatenated page text, or "" on error.
 */
async function extractTextFromPDF(filePath) {
  let loadingTask;

  try {
    const dataBuffer = fs.readFileSync(filePath);

    loadingTask = pdfjsLib.getDocument({
      data: new Uint8Array(dataBuffer),
    });

    const pdf = await loadingTask.promise;

    console.log(`📄 Pages: ${pdf.numPages}`);

    const pageTexts = [];

    for (let i = 1; i <= pdf.numPages; i++) {
      const page = await pdf.getPage(i);
      const content = await page.getTextContent();

      pageTexts.push(content.items.map((item) => item.str).join(" "));
    }

    // Each page contributes its text followed by a newline.
    return pageTexts.map((pageText) => `${pageText}\n`).join("");
  } catch (error) {
    console.log("❌ PDF extraction error:", error);

    return "";
  } finally {
    if (loadingTask) {
      try {
        await loadingTask.destroy();
      } catch (cleanupError) {
        // A cleanup failure must not replace the value being returned.
        console.log("⚠️ PDF cleanup error:", cleanupError);
      }
    }
  }
}
|
||
|
||
// ======================
|
||
// PROCESS PDF
|
||
// ======================
|
||
/**
 * Builds a deterministic, UUID-formatted point ID for one chunk.
 *
 * The ID is derived from a SHA-256 hash of file name, page number and chunk
 * index, so indexing the same file again overwrites its points instead of
 * adding duplicates. The version/variant bits are set so the result has the
 * layout of a custom (version 8) UUID.
 *
 * @param {string} fileName - Name of the source PDF.
 * @param {number} pageNum - 1-based page number of the chunk.
 * @param {number} chunkIndex - Index of the chunk within the file.
 * @returns {string} Lower-case UUID string (8-4-4-4-12 hex groups).
 */
function buildPointId(fileName, pageNum, chunkIndex) {
  const bytes = crypto
    .createHash("sha256")
    .update(`${fileName}::${pageNum}::${chunkIndex}`)
    .digest()
    .subarray(0, 16);

  bytes[6] = (bytes[6] & 0x0f) | 0x80; // UUID version nibble
  bytes[8] = (bytes[8] & 0x3f) | 0x80; // UUID variant bits

  const hex = bytes.toString("hex");

  return [
    hex.slice(0, 8),
    hex.slice(8, 12),
    hex.slice(12, 16),
    hex.slice(16, 20),
    hex.slice(20),
  ].join("-");
}

/**
 * Upserts one batch of points into `COLLECTION_NAME` and waits for completion.
 *
 * @param {object[]} points - Points to upload.
 * @param {string} [label=""] - Text inserted before the count in the log line.
 * @returns {Promise<void>}
 */
async function uploadPoints(points, label = "") {
  console.log(`⬆️ Uploading ${label}${points.length} vectors`);

  await qdrant.upsert(COLLECTION_NAME, {
    wait: true,
    points,
  });
}

/**
 * Indexes one PDF: extracts text page by page, chunks it (1200 characters
 * with 250 overlap), embeds every chunk and upserts the points to Qdrant in
 * batches of 50.
 *
 * Pages whose text is empty or whitespace-only are skipped. Errors are logged
 * and do not propagate, so a caller looping over several files can continue
 * with the next one.
 *
 * Changes over the previous implementation:
 *  - point IDs were `Number(`${Date.now()}${chunkIndex}`)`, which exceeds
 *    Number.MAX_SAFE_INTEGER once the chunk index reaches four digits (IDs
 *    then round and collide) and produced duplicates on every re-run; IDs are
 *    now deterministic UUID strings;
 *  - the pdf.js loading task is destroyed when processing ends;
 *  - the outcome is reported through the return value.
 *
 * @param {string} filePath - Path of the PDF file to read.
 * @param {string} fileName - Name stored in each point's payload and used in logs.
 * @returns {Promise<boolean>} `true` when the file was indexed, `false` when
 *   an error was caught and logged.
 */
async function processPDF(filePath, fileName) {
  const batchSize = 50;
  let loadingTask;

  try {
    const dataBuffer = fs.readFileSync(filePath);

    loadingTask = pdfjsLib.getDocument({
      data: new Uint8Array(dataBuffer),
    });

    const pdf = await loadingTask.promise;

    console.log(`📄 ${fileName} - ${pdf.numPages} pages`);

    let batchPoints = [];
    let globalChunkIndex = 0; // runs across all pages of this file

    for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
      console.log(`📖 Processing page ${pageNum}/${pdf.numPages}`);

      const page = await pdf.getPage(pageNum);
      const content = await page.getTextContent();
      const pageText = content.items.map((item) => item.str).join(" ");

      if (pageText.trim().length === 0) {
        continue;
      }

      const chunks = chunkText(pageText, 1200, 250);

      for (const chunk of chunks) {
        const embedding = await createEmbedding(chunk);

        batchPoints.push({
          id: buildPointId(fileName, pageNum, globalChunkIndex),
          vector: embedding,
          payload: {
            file: fileName,
            page: pageNum,
            chunk: globalChunkIndex,
            text: chunk,
            created_at: new Date().toISOString(),
          },
        });

        globalChunkIndex++;

        if (batchPoints.length >= batchSize) {
          await uploadPoints(batchPoints);
          // Start a fresh array; the uploaded one is not mutated afterwards.
          batchPoints = [];
        }
      }
    }

    if (batchPoints.length > 0) {
      await uploadPoints(batchPoints, "final ");
    }

    console.log(`✅ ${fileName} indexed successfully`);

    return true;
  } catch (error) {
    console.log(`❌ Error processing ${fileName}:`, error);

    return false;
  } finally {
    if (loadingTask) {
      try {
        await loadingTask.destroy();
      } catch (cleanupError) {
        // A cleanup failure must not replace the outcome being returned.
        console.log(`⚠️ PDF cleanup error for ${fileName}:`, cleanupError);
      }
    }
  }
}
|
||
|
||
// ======================
|
||
// MAIN
|
||
// ======================
|
||
/**
 * Entry point: loads the embedding model, ensures the collection exists and
 * indexes every PDF found in the `uploads` folder next to this file.
 *
 * Files are processed one after another. The `.pdf` extension is matched
 * case-insensitively (previously `.PDF` files were silently ignored).
 *
 * A file counts as failed only when `processPDF` resolves to exactly `false`;
 * any other result (including `undefined`) is treated as success. If at least
 * one file failed, a summary is written to stderr instead of the success
 * message. On failure or on an unexpected error `process.exitCode` is set to
 * 1 so that callers of the script can detect it.
 *
 * @returns {Promise<void>} Never rejects; errors are logged.
 */
async function main() {
  try {
    await loadModel();
    await createCollection();

    const folder = path.join(__dirname, "uploads");

    if (!fs.existsSync(folder)) {
      console.log("❌ uploads folder not found");
      return;
    }

    const files = fs
      .readdirSync(folder)
      .filter((file) => file.toLowerCase().endsWith(".pdf"));

    if (files.length === 0) {
      console.log("⚠️ No PDFs found");
      return;
    }

    console.log(`📚 Found ${files.length} PDFs`);

    const failedFiles = [];

    for (const file of files) {
      const filePath = path.join(folder, file);

      console.log(`\n📄 Processing ${file}`);

      const outcome = await processPDF(filePath, file);

      if (outcome === false) {
        failedFiles.push(file);
      }
    }

    if (failedFiles.length > 0) {
      console.error(
        `\n⚠️ ${failedFiles.length} of ${files.length} PDFs failed: ${failedFiles.join(", ")}`
      );
      process.exitCode = 1;
      return;
    }

    console.log("\n🎉 All PDFs indexed successfully");
  } catch (err) {
    console.error("❌ MAIN ERROR:", err);
    process.exitCode = 1;
  }
}
|
||
|
||
main();
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
// require("dotenv").config();
|
||
// const fs = require("fs");
|
||
// const path = require("path");
|
||
// const crypto = require("crypto");
|
||
|
||
// // ─── Suppress PDF warnings ────────────────────────────────────────────────────
|
||
// const _warn = console.warn;
|
||
// console.warn = (msg, ...a) => {
|
||
// if (typeof msg === "string" && (msg.includes("UnknownErrorException") || msg.includes("TT:"))) return;
|
||
// _warn(msg, ...a);
|
||
// };
|
||
|
||
// const pdfjsLib = require("pdfjs-dist/legacy/build/pdf.mjs");
|
||
// const { pipeline } = require("@xenova/transformers");
|
||
// const { QdrantClient } = require("@qdrant/js-client-rest");
|
||
|
||
// // ─── Config ───────────────────────────────────────────────────────────────────
|
||
// const QDRANT_URL = process.env.QDRANT_URL || "http://20.40.61.65:6333";
|
||
// const COLLECTION_NAME = "pdf_rag";
|
||
// const VECTOR_SIZE = 384;
|
||
// const CHUNK_SIZE = 1200;
|
||
// const CHUNK_OVERLAP = 250;
|
||
// const BATCH_SIZE = 100; // points per upsert call
|
||
// const EMBED_CONCURRENCY = 8; // parallel embeddings at once
|
||
// const MAX_RETRIES = 3;
|
||
|
||
// const qdrant = new QdrantClient({ url: QDRANT_URL, checkCompatibility: false, timeout: 60000 });
|
||
// let embedder;
|
||
|
||
// // ─── Semaphore ─────────────────────────────────────────────────────────────────
|
||
// class Semaphore {
|
||
// constructor(n) { this.n = n; this.queue = []; }
|
||
// acquire() {
|
||
// return new Promise(res => {
|
||
// if (this.n > 0) { this.n--; res(); }
|
||
// else this.queue.push(res);
|
||
// });
|
||
// }
|
||
// release() {
|
||
// if (this.queue.length) this.queue.shift()();
|
||
// else this.n++;
|
||
// }
|
||
// }
|
||
|
||
// // ─── Retry helper ─────────────────────────────────────────────────────────────
|
||
// async function withRetry(fn, retries = MAX_RETRIES, delay = 500) {
|
||
// for (let i = 0; i <= retries; i++) {
|
||
// try { return await fn(); }
|
||
// catch (err) {
|
||
// if (i === retries) throw err;
|
||
// console.warn(` ⚠️ Retry ${i + 1}/${retries} after error: ${err.message}`);
|
||
// await new Promise(r => setTimeout(r, delay * 2 ** i));
|
||
// }
|
||
// }
|
||
// }
|
||
|
||
// // ─── Deterministic UUID from content hash ────────────────────────────────────
|
||
// // Prevents duplicates if you re-run indexing on the same file
|
||
// function makePointId(fileName, page, chunkIndex) {
|
||
// const hash = crypto
|
||
// .createHash("sha256")
|
||
// .update(`${fileName}::${page}::${chunkIndex}`)
|
||
// .digest("hex");
|
||
// // Qdrant supports UUID strings or unsigned ints; use hex slice as UUID-like string
|
||
// return `${hash.slice(0,8)}-${hash.slice(8,12)}-${hash.slice(12,16)}-${hash.slice(16,20)}-${hash.slice(20,32)}`;
|
||
// }
|
||
|
||
// // ─── Chunking ─────────────────────────────────────────────────────────────────
|
||
// function chunkText(text, size = CHUNK_SIZE, overlap = CHUNK_OVERLAP) {
|
||
// const chunks = [];
|
||
// text = text.replace(/\s+/g, " ").trim();
|
||
// let start = 0;
|
||
// while (start < text.length) {
|
||
// let end = start + size;
|
||
// if (end < text.length) {
|
||
// const last = text.lastIndexOf(".", end);
|
||
// if (last > start) end = last + 1;
|
||
// }
|
||
// const chunk = text.slice(start, end).trim();
|
||
// if (chunk.length > 50) chunks.push(chunk);
|
||
// start = end - overlap;
|
||
// }
|
||
// return chunks;
|
||
// }
|
||
|
||
// // ─── Embedding ────────────────────────────────────────────────────────────────
|
||
// async function embed(text) {
|
||
// const out = await embedder(text, { pooling: "mean", normalize: true });
|
||
// return Array.from(out.data);
|
||
// }
|
||
|
||
// // Embed multiple texts with bounded parallelism
|
||
// async function embedBatch(texts) {
|
||
// const sem = new Semaphore(EMBED_CONCURRENCY);
|
||
// return Promise.all(
|
||
// texts.map(async (text) => {
|
||
// await sem.acquire();
|
||
// try { return await embed(text); }
|
||
// finally { sem.release(); }
|
||
// })
|
||
// );
|
||
// }
|
||
|
||
// // ─── Qdrant helpers ───────────────────────────────────────────────────────────
|
||
// async function ensureCollection() {
|
||
// try {
|
||
// await qdrant.getCollection(COLLECTION_NAME);
|
||
// console.log("ℹ️ Collection already exists");
|
||
// } catch {
|
||
// console.log("⏳ Creating collection...");
|
||
// await qdrant.createCollection(COLLECTION_NAME, {
|
||
// vectors: { size: VECTOR_SIZE, distance: "Cosine" },
|
||
// // Optimizers: tune for bulk ingest speed, re-enable indexing after
|
||
// optimizers_config: { indexing_threshold: 0 },
|
||
// });
|
||
// console.log("✅ Collection created");
|
||
// }
|
||
// }
|
||
|
||
// // Upload a batch with retry
|
||
// async function upsertBatch(points) {
|
||
// await withRetry(() =>
|
||
// qdrant.upsert(COLLECTION_NAME, { wait: true, points })
|
||
// );
|
||
// }
|
||
|
||
// // After bulk ingest, re-enable HNSW indexing
|
||
// async function enableIndexing() {
|
||
// await qdrant.updateCollection(COLLECTION_NAME, {
|
||
// optimizers_config: { indexing_threshold: 20000 },
|
||
// });
|
||
// console.log("🔧 HNSW indexing re-enabled");
|
||
// }
|
||
|
||
// // ─── Check if file already indexed ───────────────────────────────────────────
|
||
// async function isFileIndexed(fileName) {
|
||
// try {
|
||
// const result = await qdrant.scroll(COLLECTION_NAME, {
|
||
// filter: { must: [{ key: "file", match: { value: fileName } }] },
|
||
// limit: 1,
|
||
// with_payload: false,
|
||
// with_vector: false,
|
||
// });
|
||
// return result.points.length > 0;
|
||
// } catch { return false; }
|
||
// }
|
||
|
||
// // ─── Process a single PDF ─────────────────────────────────────────────────────
|
||
// async function processPDF(filePath, fileName) {
|
||
// console.log(`\n📄 ${fileName}`);
|
||
|
||
// if (await isFileIndexed(fileName)) {
|
||
// console.log(` ⏭️ Already indexed — skipping`);
|
||
// return;
|
||
// }
|
||
|
||
// const pdf = await pdfjsLib
|
||
// .getDocument({ data: new Uint8Array(fs.readFileSync(filePath)) })
|
||
// .promise;
|
||
|
||
// console.log(` 📖 ${pdf.numPages} pages`);
|
||
|
||
// const allChunks = []; // { text, page, chunkIndex }
|
||
|
||
// // 1️⃣ Extract all text first (fast, sequential is fine for I/O)
|
||
// for (let p = 1; p <= pdf.numPages; p++) {
|
||
// const page = await pdf.getPage(p);
|
||
// const content = await page.getTextContent();
|
||
// const text = content.items.map(i => i.str).join(" ");
|
||
// if (!text.trim()) continue;
|
||
// const chunks = chunkText(text);
|
||
// chunks.forEach((chunk, ci) => allChunks.push({ text: chunk, page: p, chunkIndex: allChunks.length }));
|
||
// }
|
||
|
||
// console.log(` 🧩 ${allChunks.length} chunks — embedding with concurrency=${EMBED_CONCURRENCY}`);
|
||
|
||
// // 2️⃣ Embed all chunks in parallel (bounded by semaphore)
|
||
// const start = Date.now();
|
||
// const vectors = await embedBatch(allChunks.map(c => c.text));
|
||
// const elapsed = ((Date.now() - start) / 1000).toFixed(1);
|
||
// console.log(` ⚡ Embedding done in ${elapsed}s`);
|
||
|
||
// // 3️⃣ Build points
|
||
// const points = allChunks.map((c, i) => ({
|
||
// id: makePointId(fileName, c.page, c.chunkIndex),
|
||
// vector: vectors[i],
|
||
// payload: {
|
||
// file: fileName,
|
||
// page: c.page,
|
||
// chunk: c.chunkIndex,
|
||
// text: c.text,
|
||
// created_at: new Date().toISOString(),
|
||
// },
|
||
// }));
|
||
|
||
// // 4️⃣ Batch upsert with progress
|
||
// let uploaded = 0;
|
||
// for (let i = 0; i < points.length; i += BATCH_SIZE) {
|
||
// const batch = points.slice(i, i + BATCH_SIZE);
|
||
// await upsertBatch(batch);
|
||
// uploaded += batch.length;
|
||
// process.stdout.write(`\r ⬆️ ${uploaded}/${points.length} vectors uploaded`);
|
||
// }
|
||
// console.log(`\n ✅ ${fileName} indexed`);
|
||
// }
|
||
|
||
// // ─── Main ─────────────────────────────────────────────────────────────────────
|
||
// async function main() {
|
||
// console.log("⏳ Loading embedding model...");
|
||
// embedder = await pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2");
|
||
// console.log("✅ Model loaded\n");
|
||
|
||
// await ensureCollection();
|
||
|
||
// const folder = path.join(__dirname, "uploads");
|
||
// if (!fs.existsSync(folder)) return console.log("❌ uploads/ folder not found");
|
||
|
||
// const pdfs = fs.readdirSync(folder).filter(f => f.endsWith(".pdf"));
|
||
// if (!pdfs.length) return console.log("⚠️ No PDFs found");
|
||
|
||
// console.log(`📚 Found ${pdfs.length} PDF(s)\n`);
|
||
|
||
// const t0 = Date.now();
|
||
// for (const file of pdfs) {
|
||
// await processPDF(path.join(folder, file), file);
|
||
// }
|
||
|
||
// await enableIndexing(); // re-enable HNSW after bulk load
|
||
|
||
// console.log(`\n🎉 Done in ${((Date.now() - t0) / 1000).toFixed(1)}s`);
|
||
// }
|
||
|
||
// main().catch(err => { console.error("❌ Fatal:", err); process.exit(1); });
|