Update the exporter to target the Typesense search engine

This commit is contained in:
Julio Ruiz 2026-05-16 22:00:07 -05:00
parent 2acad0c6c8
commit 8ea83f825e
4 changed files with 1127 additions and 40 deletions

View File

@ -0,0 +1,153 @@
import Typesense from "typesense";
import dayjs from "dayjs";
import { JSDOM } from "jsdom";
// jsdom window created at module load; it is not referenced by the code
// visible in this file but is kept in case other code relies on it.
const { window } = new JSDOM();

// Typesense connection used by createDocument and createParagraphs.
// Every setting can be overridden through the environment so credentials do
// not have to live in source control; the literals are the previous
// hard-coded values, kept as fallbacks for backward compatibility.
// FIXME(security): the fallback API key is committed in plain text. Rotate it
// and remove the fallback once TYPESENSE_API_KEY is provisioned everywhere.
let client = new Typesense.Client({
  nodes: [
    {
      host: process.env.TYPESENSE_HOST || "searchts.carpa.com",
      port: process.env.TYPESENSE_PORT || "443",
      protocol: process.env.TYPESENSE_PROTOCOL || "https",
    },
  ],
  apiKey: process.env.TYPESENSE_API_KEY || "3KrmYlcirARCxG4AZPV5bnJgQD0qtoW0",
});
import fs from 'fs';
import readline from 'readline';
import { create } from "domain";
/**
 * Reads a JSON Lines file and indexes every record it contains.
 *
 * Each non-blank line is parsed as JSON and passed to createDocument. When the
 * record has a string `body`, the body is split into paragraphs and handed to
 * createParagraphs together with the id returned by createDocument. A failure
 * on one line is recorded and reported at the end; it never aborts the run.
 *
 * @param {string} filePath - Path of the input file (one JSON object per line).
 * @returns {Promise<void>} Resolves after the last line has been handled.
 */
async function processJsonLines(filePath) {
  const errors = [];
  const stream = fs.createReadStream(filePath);
  const rl = readline.createInterface({
    input: stream,
    crlfDelay: Infinity,
  });
  let lineNumber = 0; // physical line in the file, used to locate bad input
  let lineCount = 0; // number of non-blank lines actually processed
  for await (const line of rl) {
    lineNumber++;
    // Blank lines (e.g. a trailing newline) are not records.
    if (line.trim() === '') continue;
    lineCount++;
    // Declared outside the try so the catch handler can label the error
    // without parsing the line a second time (which would throw again when
    // the line is not valid JSON).
    let data;
    try {
      data = JSON.parse(line);
      console.log('Processing: ', data.title);
      const docId = await createDocument(data);
      console.log('Document created with ID:', docId);
      // Records without a string body (missing or null) have no paragraphs.
      if (typeof data.body === 'string') {
        const hasHtmlParagraphs = data.body.length > 0 && hasParagraphs(data.body).length > 0;
        // HTML bodies are cut on bare <p> / </p> tags; anything else is
        // treated as plain text with blank-line separated paragraphs.
        const separator = hasHtmlParagraphs ? /<\/?p>/ : '\n\n';
        const paragraphs = data.body.split(separator).map((p) => p.trim()).filter(Boolean);
        if (paragraphs.length > 0) {
          await createParagraphs(docId, paragraphs, data.locale, data.type);
        }
      }
    } catch (error) {
      // Prefer the record id; fall back to the file position when the line
      // could not be parsed or carries no id.
      errors.push({ line: data?.id ?? `line ${lineNumber}`, error: error.message });
    }
  }
  console.log(`Finished processing. Total lines: ${lineCount}. Errors: ${errors.length}`);
  if (errors.length > 0) {
    console.log('Errors:', errors);
  }
}
/**
 * Parses an HTML string with jsdom and collects its <p> elements.
 *
 * Despite the name this does not return a boolean: callers check the
 * `length` of the returned collection.
 *
 * @param {string} htmlString - HTML fragment or document to inspect.
 * @returns {NodeList} Result of querySelectorAll('p') on the parsed document.
 */
function hasParagraphs(htmlString) {
  const { document } = new JSDOM(htmlString).window;
  return document.querySelectorAll('p');
}
// Script entry point: index the conferences export.
// Alternate input: './input/activities_wp_ES.json'.
// The promise is handled explicitly so an unexpected failure (for example an
// unreadable input file) is reported and reflected in the exit code.
processJsonLines('./input/conferences_wp_ES.json').catch((error) => {
  console.error('Import failed:', error);
  process.exitCode = 1;
});
/**
 * Builds one record per non-blank paragraph and bulk-upserts them into the
 * 'paragraphs' collection.
 *
 * Every record stores the paragraph twice: `raw` keeps the markup (wrapped in
 * <p>…</p> when it does not already start with "<p") for display, and `text`
 * is the same content with tags stripped for searching. Paragraphs are
 * numbered from 1 in the order they are kept, and the record id is
 * `<locale>-<documentId>-<number>`.
 *
 * @param {string} documentId - Id of the parent document.
 * @param {string[]} paragraphs - Paragraph strings; blank entries are skipped.
 * @param {string} locale - Locale stored on each record and used in its id.
 * @param {string} type - Document type stored on each record.
 * @returns {Promise<*>} The value returned by the client's import call, or an
 *   empty string when there was nothing to send.
 */
async function createParagraphs(documentId, paragraphs, locale, type) {
  const records = [];
  for (const para of paragraphs ?? []) {
    if (!para || para.trim() === '') continue; // Skip empty paragraphs
    let fixedHtml = para.trim();
    if (!fixedHtml.startsWith('<p')) {
      // Not already a paragraph element: wrap it so `raw` is displayable.
      fixedHtml = `<p>${fixedHtml}</p>`;
    }
    const number = records.length + 1;
    records.push({
      document_id: documentId,
      raw: fixedHtml,
      text: fixedHtml.replace(/<[^>]+>/g, ''), // Remove HTML tags for search
      number,
      locale,
      type,
      id: `${locale}-${documentId}-${number}`, // Unique ID for the paragraph
    });
  }
  // Nothing to index: avoid sending an empty import payload.
  if (records.length === 0) return '';
  // One JSON document per line, each line newline-terminated.
  const jsonl = records.map((record) => `${JSON.stringify(record)}\n`).join('');
  return await client.collections('paragraphs').documents().import(jsonl, { action: 'upsert' });
}
/**
 * Maps one exported record onto the 'documents' collection layout and upserts
 * it through the Typesense client.
 *
 * The generated `code` is `<locale>-<YYYYMMDD of timestamp>-<activity>`.
 * Location fields, file links, slug and body fall back to null when the
 * source value is falsy; duration falls back to 0 only when it is nullish.
 *
 * @param {object} data - Parsed record from the input file.
 * @returns {Promise<string>} The `id` of the document returned by the upsert.
 */
async function createDocument(data){
  const day = dayjs.unix(data.timestamp).format('YYYYMMDD');
  const code = `${data.locale}-${day}-${data.activity}`;
  const id = data.id.toString();

  // Links to the media attached to the record.
  const sourceFiles = data.files;
  const files = {
    youtube: data.youtube,
    video: sourceFiles?.videos?.file || null,
    audio: sourceFiles?.audios?.[0]?.[0]?.file2 || null,
    booklet: sourceFiles?.textos?.[0]?.[1]?.file2 || null,
    simple: sourceFiles?.textos?.[0]?.[0]?.file2 || null,
  };

  const record = {
    code,
    locale: data.locale,
    id,
    type: data.type,
    title: data.title,
    timestamp: data.timestamp,
    date: data.date,
    activity: data.activity,
    duration: data.duration ?? 0,
    bible_study: data.bible_study,
    place: data.place || null,
    city: data.city || null,
    state: data.state || null,
    country: data.country || null,
    draft: data.draft,
    thumbnail: data.thumbnail,
    files,
    directus: "",
    wp: data.id.toString(),
    private: false,
    slug: data.translations?.[0]?.interventions?.[0]?.slug || null,
    body: data.body || null,
  };

  const saved = await client.collections('documents').documents().upsert(record);
  return saved.id;
}

View File

@ -3,6 +3,7 @@
const fs = require('fs'); const fs = require('fs');
const path = require('path'); const path = require('path');
const { parse } = require('json2csv');
const striptags = require('striptags'); const striptags = require('striptags');
const dayjs = require('dayjs'); const dayjs = require('dayjs');
const he = require('he'); const he = require('he');
@ -128,74 +129,95 @@ async function generateJson( type, year ) {
let nitem = {} let nitem = {}
if (type == 'activities') { if (type == 'activities') {
nitem.locale = LOCALE;
nitem.id = item.id.toString() nitem.id = item.id.toString()
nitem.type = 'activities' nitem.type = 'activities'
nitem.title = item.title nitem.title = item.title
nitem.date = dayjs(item.date).unix() nitem.timestamp = dayjs(item.date).unix()
nitem.date = item.date
nitem.activity = parseInt(item.activity) nitem.activity = parseInt(item.activity)
nitem.duration = item.duration ?? 0;
nitem.bible_study = parseInt(item.bible_study) nitem.bible_study = parseInt(item.bible_study)
nitem.place = item.place || null; nitem.place = item.place || null;
nitem.city = item.city || null; nitem.city = item.city || null;
nitem.state = item.state || null; nitem.state = item.state || null;
nitem.country = item.country || null; nitem.country = item.country || null;
nitem.duration = item.duration ?? 0;
nitem.body = item.translations?.[0]?.interventions?.[0]?.text nitem.body = item.translations?.[0]?.interventions?.[0]?.text
//nitem.text = striptags(he.decode(item.translations?.[0]?.interventions?.[0]?.text || '')) // nitem.text = striptags(he.decode(item.translations?.[0]?.interventions?.[0]?.text || ''))
nitem.draft = item.draft nitem.draft = item.draft
nitem.private = false
nitem.year = dayjs(item.date).year().toString()
nitem.month = (dayjs(item.date).month() + 1).toString().padStart(2, "0")
nitem.thumbnail = item.thumbnail nitem.thumbnail = item.thumbnail
nitem.slug = item.translations[0]?.interventions[0]?.slug
nitem.files = {} nitem.files = {}
nitem.files.youtube = item.youtube nitem.youtube = item.youtube
nitem.files.video = item.files?.videos?.file nitem.video = item.files?.videos?.file
nitem.files.audio = item.files?.audios[0]?.[0]?.file2 nitem.audio = item.files?.audios[0]?.[0]?.file2
nitem.files.booklet = item.files?.textos[0]?.[1]?.file2 nitem.booklet = item.files?.textos[0]?.[1]?.file2
nitem.simple = item.files?.textos[0]?.[0]?.file2
nitem.directus = "";
nitem.wp = item.id.toString()
nitem.typesense = true;
nitem.rm = item.rm;
nitem.private = false
nitem.slug = item.translations[0]?.interventions[0]?.slug
//Filter out anything before 25/12/2021 //Filter out anything before 25/12/2021
if( nitem.date < 1640408400 ) { // if( nitem.date < 1640408400 ) {
return false // return false
} // }
} }
if (type == 'conferences') { if (type == 'conferences') {
nitem.locale = LOCALE;
nitem.id = item.id.toString() nitem.id = item.id.toString()
nitem.type = 'conferences' nitem.type = 'conferences'
nitem.title = item.title nitem.title = item.title
nitem.date = item.timestamp nitem.timestamp = item.timestamp
nitem.date = ''
nitem.activity = parseInt(item.activity) nitem.activity = parseInt(item.activity)
nitem.duration = item.duration ?? 0;
//nitem.bible_study = parseInt(item.bible_study) //nitem.bible_study = parseInt(item.bible_study)
//nitem.place = item.conferences_id?.place || null; //nitem.place = item.conferences_id?.place || null;
nitem.place = '';
nitem.city = item.city || null; nitem.city = item.city || null;
nitem.state = item.state || null; nitem.state = item.state || null;
nitem.country = item.country || null; nitem.country = item.country || null;
nitem.duration = item.duration ?? 0; nitem.body = item.body;
nitem.body = item.body nitem.thumbnail = item.thumbnail
nitem.youtube = item.files?.youtube
nitem.video = item.files?.video
nitem.audio = item.files?.audio
nitem.booklet = item.files?.pdf
nitem.simple = item.files?.pdf_simple
nitem.directus = '';
nitem.wp = item.id.toString();
nitem.typesense = true;
nitem.rm = item.rm;
nitem.private = item.private == 1 ? true : false; nitem.private = item.private == 1 ? true : false;
//nitem.draft = item.draft //nitem.draft = item.draft
nitem.year = dayjs(item.date).year().toString() //nitem.year = dayjs(item.date).year().toString()
nitem.month = (dayjs(item.date).month() + 1).toString().padStart(2, "0") //nitem.month = (dayjs(item.date).month() + 1).toString().padStart(2, "0")
nitem.thumbnail = item.thumbnail
nitem.slug = item.slug nitem.slug = item.slug
nitem.files = {} //nitem.files = {}
nitem.files.youtube = item.files?.youtube
nitem.files.video = item.files?.video
nitem.files.audio = item.files?.audio
nitem.files.booklet = item.files?.pdf
nitem.files.simple = item.files?.pdf_simple
}
if (nitem.title && nitem.id) {
return nitem;
} }
// if (nitem.title && nitem.id) {
// return nitem;
// }
return nitem;
}) })
if( nitems.length > 0 ){
//let csv = parse(nitems);
// jsonlData += csv
//let csv = nitems.map(row => fields.map(fieldName => JSON.stringify(row[fieldName], replacer)).join(','));
//return csv;
jsonlData += nitems.filter(item => item !== false).map(JSON.stringify).join('\n') jsonlData += nitems.filter(item => item !== false).map(JSON.stringify).join('\n')
//jsonlData += (nitems.filter(item => item !== false))
//jsonlData += csv;
//console.log( csv );
//console.log( documents.length + " documents to write for year " + year + " and type " + type ); //console.log( documents.length + " documents to write for year " + year + " and type " + type );
writeFile(jsonlData, type, year) }
writeFile(jsonlData, type, year)
}; };
} }
}); });
@ -226,7 +248,7 @@ for( let year = 1974; year < 2019; year++){
generateJson( 'conferences', year ); generateJson( 'conferences', year );
} }
// for( let year = 2021; year < 2027; year++){ // for( let year = 2021; year < 2027; year++){
// generateJson( 'activities', year ); // generateJson( 'activities', year );
// //generateJson( 'activities_translations' ); // // //generateJson( 'activities_translations' );
// } // }

File diff suppressed because it is too large Load Diff

View File

@ -9,15 +9,22 @@
"license": "ISC", "license": "ISC",
"description": "", "description": "",
"dependencies": { "dependencies": {
"@babel/runtime": "^7.29.2",
"@xmldom/xmldom": "^0.9.10",
"cheerio": "^1.2.0", "cheerio": "^1.2.0",
"dayjs": "^1.11.19", "dayjs": "^1.11.19",
"he": "^1.2.0", "he": "^1.2.0",
"js-jsonl": "^1.1.1",
"jsdom": "^29.1.1",
"json-to-frontmatter-markdown": "^1.0.0", "json-to-frontmatter-markdown": "^1.0.0",
"json-to-jsonl": "^1.1.0", "json-to-jsonl": "^1.1.0",
"json2csv": "^6.0.0-alpha.2",
"jsonlines": "^0.1.1",
"mustache": "^4.2.0", "mustache": "^4.2.0",
"mysql": "^2.18.1", "mysql": "^2.18.1",
"node-fetch": "^3.3.2", "node-fetch": "^3.3.2",
"request": "^2.88.2", "request": "^2.88.2",
"striptags": "^3.2.0" "striptags": "^3.2.0",
"typesense": "^3.0.6"
} }
} }