import Typesense from "typesense";
import dayjs from "dayjs";
import { JSDOM } from "jsdom";
import fs from 'fs';
import readline from 'readline';
// NOTE(review): `create` is not used in the visible code; kept in case something else relies on it.
import { create } from "domain";

// NOTE(review): `window` is not used in the visible code; kept to avoid changing module-level state.
const { window } = new JSDOM();

const client = new Typesense.Client({
  nodes: [
    {
      host: "searchts.carpa.com",
      port: "443",
      protocol: "https",
    },
  ],
  // SECURITY(review): this key is committed in source. Prefer supplying it via the
  // TYPESENSE_API_KEY environment variable and rotate the literal below; it is kept
  // only as a fallback so existing invocations keep working.
  apiKey: process.env.TYPESENSE_API_KEY ?? "3KrmYlcirARCxG4AZPV5bnJgQD0qtoW0",
});

/**
 * Reads a JSON-lines file and, for every record, upserts one entry into the
 * `documents` collection and one entry per paragraph of its `body` into the
 * `paragraphs` collection.
 *
 * A failure on one line (invalid JSON or a rejected Typesense call) is recorded
 * and reported at the end; it does not stop the remaining lines from being processed.
 *
 * @param {string} filePath - Path of the JSON-lines file to import.
 * @returns {Promise<void>}
 */
async function processJsonLines(filePath) {
  const errors = [];
  const stream = fs.createReadStream(filePath);
  const rl = readline.createInterface({
    input: stream,
    crlfDelay: Infinity,
  });

  let lineCount = 0;
  for await (const line of rl) {
    lineCount++;
    // Blank lines (e.g. a trailing newline at end of file) are not records.
    if (line.trim() === '') continue;

    // Declared outside the try so the catch can report the record id when parsing succeeded.
    let data;
    try {
      // Parse the line as JSON
      data = JSON.parse(line);
      console.log('Processing: ', data.title);

      const doc_id = await createDocument(data);
      console.log('Document created with ID:', doc_id);

      // Records without a textual body (missing, null, empty) have no paragraphs to index.
      if (typeof data.body === 'string' && data.body.length > 0) {
        const paragraphs = splitBodyIntoParagraphs(data.body);
        await createParagraphs(doc_id, paragraphs, data.locale, data.type);
      }
    } catch (error) {
      // Never re-parse the line here: if parsing was what failed, it would throw again
      // and abort the whole import. Fall back to the physical line number instead.
      errors.push({ line: data?.id ?? `line ${lineCount}`, error: error.message });
    }
  }

  console.log(`Finished processing. Total lines: ${lineCount}. Errors: ${errors.length}`);
  if (errors.length > 0) {
    console.log('Errors:', errors);
  }
}

/**
 * Splits a body into trimmed, non-empty paragraph strings.
 * Bodies containing <p> elements are split on <p> / </p> tags; any other body
 * is split on blank lines.
 *
 * @param {string} body - Raw body text, HTML or plain.
 * @returns {string[]} Paragraph fragments in document order.
 */
function splitBodyIntoParagraphs(body) {
  const separator = hasParagraphs(body).length > 0 ? /<\/?p>/ : '\n\n';
  return body
    .split(separator)
    .map((p) => p.trim())
    .filter(Boolean);
}

/**
 * Parses an HTML string and returns every <p> element found in it.
 *
 * @param {string} htmlString - HTML markup to inspect.
 * @returns {NodeList} The matched <p> elements; `length` is 0 when there are none.
 */
function hasParagraphs(htmlString) {
  const dom = new JSDOM(htmlString);
  // Check if any <p> tags exist
  return dom.window.document.querySelectorAll('p');
}

/**
 * Upserts the given paragraphs into the `paragraphs` collection as one JSONL import.
 *
 * Each stored paragraph keeps a `raw` HTML version (wrapped in <p> tags) for display
 * and a tag-free `text` version for search. Paragraphs are numbered from 1 in the
 * order given, counting only non-empty ones.
 *
 * @param {string} documentId - Id of the parent document.
 * @param {string[]} paragraphs - Paragraph fragments, HTML or plain text.
 * @param {string} locale - Locale stored on each paragraph and used as id prefix.
 * @param {string} type - Document type stored on each paragraph.
 * @returns {Promise<*>} Whatever the Typesense import call resolves to, or an empty
 *   string when there was nothing to import.
 */
async function createParagraphs(documentId, paragraphs, locale, type) {
  const records = [];

  for (const para of paragraphs) {
    if (!para || para.trim() === '') continue; // Skip empty paragraphs

    let fixedHtml = para.trim();
    if (!fixedHtml.startsWith('<p>')) {
      // If it doesn't start with <p>, wrap it in <p> tags
      fixedHtml = `<p>${fixedHtml}</p>`;
    }

    const number = records.length + 1;
    records.push(JSON.stringify({
      document_id: documentId,
      raw: fixedHtml,
      text: fixedHtml.replace(/<[^>]+>/g, ''), // Remove HTML tags for search
      number: number,
      locale: locale,
      type: type,
      id: `${locale}-${documentId}-${number}`, // Unique ID for the paragraph
    }));
  }

  // Do not send an empty import request when every paragraph was blank.
  if (records.length === 0) return '';

  const jsonl = `${records.join('\n')}\n`;
  return client.collections('paragraphs').documents().import(jsonl, { action: 'upsert' });
}

/**
 * Builds the Typesense document for one source record and upserts it into the
 * `documents` collection.
 *
 * @param {object} data - Parsed record from the input file. `id`, `locale`,
 *   `timestamp` and `activity` are read unconditionally; the remaining fields
 *   fall back to null (or 0 for `duration`) where the code provides a default.
 * @returns {Promise<string>} The `id` of the document returned by Typesense.
 */
async function createDocument(data) {
  const id = data.id.toString();
  // Date part of the code, derived from the record's Unix timestamp (seconds).
  const day = dayjs.unix(data.timestamp).format('YYYYMMDD');

  const document = {
    code: `${data.locale}-${day}-${data.activity}`,
    locale: data.locale,
    id: id,
    type: data.type,
    title: data.title,
    timestamp: data.timestamp,
    date: data.date,
    activity: data.activity,
    duration: data.duration ?? 0,
    bible_study: data.bible_study,
    place: data.place || null,
    city: data.city || null,
    state: data.state || null,
    country: data.country || null,
    draft: data.draft,
    thumbnail: data.thumbnail,
    files: {
      youtube: data.youtube,
      video: data.files?.videos?.file || null,
      audio: data.files?.audios?.[0]?.[0]?.file2 || null,
      booklet: data.files?.textos?.[0]?.[1]?.file2 || null,
      simple: data.files?.textos?.[0]?.[0]?.file2 || null,
    },
    directus: "",
    wp: id,
    private: false,
    slug: data.translations?.[0]?.interventions?.[0]?.slug || null,
    body: data.body || null,
  };

  const doc = await client.collections('documents').documents().upsert(document);
  return doc.id;
}

// Alternative input: './input/activities_wp_ES.json'
processJsonLines('./input/conferences_wp_ES.json').catch((error) => {
  // Failures outside the per-line handler (e.g. the input file cannot be read) land here.
  console.error('Import failed:', error);
  process.exitCode = 1;
});