Update export to target the Typesense search engine
This commit is contained in:
parent
2acad0c6c8
commit
8ea83f825e
|
|
@ -0,0 +1,153 @@
|
||||||
|
import Typesense from "typesense";
|
||||||
|
import dayjs from "dayjs";
|
||||||
|
import { JSDOM } from "jsdom";
|
||||||
|
|
||||||
|
// Module-level JSDOM window. Nothing else in the visible part of this file
// reads `window`; it is kept so any unseen consumer keeps working.
const { window } = new JSDOM();

// The API key must never live in source control: read it from the
// environment and fail fast with a clear message when it is missing.
// NOTE(review): the key that was previously committed here should be rotated.
const TYPESENSE_API_KEY = process.env.TYPESENSE_API_KEY;
if (!TYPESENSE_API_KEY) {
  throw new Error(
    "TYPESENSE_API_KEY is not set; export it before running this import script."
  );
}

// Shared Typesense client used by createDocument() and createParagraphs().
// Host, port and protocol default to the original values and can be
// overridden through the environment (e.g. to target a staging cluster).
const client = new Typesense.Client({
  nodes: [
    {
      host: process.env.TYPESENSE_HOST ?? "searchts.carpa.com",
      port: process.env.TYPESENSE_PORT ?? "443",
      protocol: process.env.TYPESENSE_PROTOCOL ?? "https",
    },
  ],
  apiKey: TYPESENSE_API_KEY,
});
|
||||||
|
|
||||||
|
import fs from 'fs';
|
||||||
|
import readline from 'readline';
|
||||||
|
import { create } from "domain";
|
||||||
|
|
||||||
|
/**
 * Reads a JSON Lines file and indexes every record in Typesense.
 *
 * For each non-blank line the record is parsed, upserted through
 * createDocument(), and its `body` (when it is a string) is split into
 * paragraphs that are sent to createParagraphs(). Bodies containing <p>
 * elements are split on the literal <p> / </p> tags; any other body is split
 * on blank lines.
 *
 * A failure on one line (invalid JSON, or a rejected Typesense call) is
 * recorded and processing continues with the next line. A summary is logged
 * at the end.
 *
 * @param {string} filePath - Path of the JSON Lines file to import.
 * @returns {Promise<void>} Resolves once every line has been processed.
 */
async function processJsonLines(filePath) {
  const errors = [];
  const stream = fs.createReadStream(filePath);
  const rl = readline.createInterface({
    input: stream,
    crlfDelay: Infinity,
  });

  // Number of non-blank lines seen so far (1-based position of the record).
  let lineCount = 0;

  for await (const line of rl) {
    // Blank lines carry no record; skipping them avoids a pointless parse error.
    if (line.trim() === '') continue;

    lineCount++;

    // Declared outside the try so the catch can report the record id when
    // parsing succeeded, without parsing the line a second time (which would
    // throw again on malformed input and abort the whole run).
    let data;

    try {
      data = JSON.parse(line);

      console.log('Processing: ', data.title);

      const doc_id = await createDocument(data);

      console.log('Document created with ID:', doc_id);

      // Only string bodies can be split; null/missing bodies are simply skipped.
      if (typeof data.body === 'string') {
        const separator = hasParagraphs(data.body).length > 0 ? /<\/?p>/ : '\n\n';
        const paragraphs = data.body
          .split(separator)
          .map((p) => p.trim())
          .filter(Boolean);

        // Nothing to index for an empty body.
        if (paragraphs.length > 0) {
          await createParagraphs(doc_id, paragraphs, data.locale, data.type);
        }
      }
    } catch (error) {
      // `line` keeps its original meaning (the record id) when it is known;
      // `lineNumber` locates records that could not even be parsed.
      errors.push({
        line: data?.id ?? null,
        lineNumber: lineCount,
        error: error.message,
      });
    }
  }

  console.log(`Finished processing. Total lines: ${lineCount}. Errors: ${errors.length}`);
  if (errors.length > 0) {
    console.log('Errors:', errors);
  }
}
|
||||||
|
|
||||||
|
/**
 * Parses an HTML fragment with JSDOM and returns every <p> element in it.
 *
 * Despite the name, the result is the NodeList from querySelectorAll, not a
 * boolean; callers check its `length` to decide whether paragraphs exist.
 *
 * @param {string} htmlString - HTML fragment to inspect.
 * @returns {NodeList} The <p> elements found (empty when there are none).
 */
function hasParagraphs(htmlString) {
  const { document } = new JSDOM(htmlString).window;
  return document.querySelectorAll('p');
}
|
||||||
|
|
||||||
|
// Entry point. The input file can be given as the first CLI argument, e.g.
//   node <this-script> ./input/activities_wp_ES.json
// and defaults to the conferences export when omitted.
const inputPath = process.argv[2] ?? './input/conferences_wp_ES.json';

// Never leave the promise floating: report the failure and signal it through
// the exit code so calling shells / CI notice.
processJsonLines(inputPath).catch((error) => {
  console.error(`Import of ${inputPath} failed:`, error);
  process.exitCode = 1;
});
|
||||||
|
|
||||||
|
/**
 * Extracts the failed entries from a Typesense bulk-import response.
 *
 * @param {string|Array<object>} result - Import response: one JSON object per
 *   line when a JSONL string was sent, or an array of result objects.
 * @returns {Array<object>} Entries whose `success` flag is exactly `false`.
 */
function findImportFailures(result) {
  let rows = [];
  if (Array.isArray(result)) {
    rows = result;
  } else if (typeof result === 'string') {
    rows = result
      .split('\n')
      .filter((row) => row.trim() !== '')
      .map((row) => {
        try {
          return JSON.parse(row);
        } catch {
          return null; // Unreadable status line: not treated as a failure.
        }
      });
  }
  return rows.filter((row) => row !== null && typeof row === 'object' && row.success === false);
}

/**
 * Upserts the paragraphs of one document into the `paragraphs` collection.
 *
 * Each non-empty paragraph becomes one record holding the HTML (`raw`,
 * wrapped in <p>…</p> when it does not already start with "<p"), a tag-free
 * `text` version for searching, its 1-based `number`, and an id of the form
 * `<locale>-<documentId>-<number>`.
 *
 * @param {string} documentId - Id of the parent document.
 * @param {string[]} paragraphs - Paragraph fragments, in reading order.
 * @param {string} locale - Locale stored on every paragraph.
 * @param {string} type - Document type stored on every paragraph.
 * @returns {Promise<string>} The raw import response from Typesense, or ''
 *   when there was nothing to import.
 * @throws {Error} When Typesense reports a failure for any paragraph.
 */
async function createParagraphs(documentId, paragraphs, locale, type) {
  const records = [];

  for (const para of paragraphs) {
    if (!para || para.trim() === '') continue; // Skip empty paragraphs

    let fixedHtml = para.trim();
    if (!fixedHtml.startsWith('<p')) {
      // Fragments produced by splitting on <p> tags have lost their wrapper.
      fixedHtml = `<p>${fixedHtml}</p>`;
    }

    // Numbering counts only the paragraphs that are actually kept.
    const number = records.length + 1;

    records.push({
      document_id: documentId,
      raw: fixedHtml,
      text: fixedHtml.replace(/<[^>]+>/g, ''), // Remove HTML tags for search
      number,
      locale,
      type,
      id: `${locale}-${documentId}-${number}`, // Unique ID for the paragraph
    });
  }

  // Do not send an empty import payload to the server.
  if (records.length === 0) {
    return '';
  }

  const jsonl = records.map((record) => JSON.stringify(record)).join('\n') + '\n';

  const result = await client
    .collections('paragraphs')
    .documents()
    .import(jsonl, { action: 'upsert' });

  // A JSONL-string import reports per-document failures inside the response
  // body instead of rejecting, so they must be surfaced explicitly.
  const failures = findImportFailures(result);
  if (failures.length > 0) {
    throw new Error(
      `Typesense rejected ${failures.length} of ${records.length} paragraph(s) ` +
      `for document ${documentId}: ${failures[0].error ?? 'unknown error'}`
    );
  }

  return result;
}
|
||||||
|
|
||||||
|
/**
 * Builds the Typesense record for one source item and upserts it into the
 * `documents` collection.
 *
 * The record `code` is `<locale>-<YYYYMMDD of timestamp>-<activity>`; the
 * source id is stored, as a string, both as `id` and as `wp`.
 *
 * @param {object} data - Parsed source record (one line of the input file).
 * @returns {Promise<string>} The `id` of the document returned by Typesense.
 */
async function createDocument(data) {
  // Falsy values (undefined, '', 0, false) are stored as null.
  const orNull = (value) => value || null;

  const day = dayjs.unix(data.timestamp).format('YYYYMMDD');
  const code = `${data.locale}-${day}-${data.activity}`;
  const wpId = data.id.toString();

  // NOTE(review): these nested paths (files.videos, files.audios, files.textos)
  // presumably match the raw source shape — confirm against the input file.
  const sourceFiles = data.files;
  const files = {
    youtube: data.youtube,
    video: orNull(sourceFiles?.videos?.file),
    audio: orNull(sourceFiles?.audios?.[0]?.[0]?.file2),
    booklet: orNull(sourceFiles?.textos?.[0]?.[1]?.file2),
    simple: orNull(sourceFiles?.textos?.[0]?.[0]?.file2),
  };

  const record = {
    code,
    locale: data.locale,
    id: wpId,
    type: data.type,
    title: data.title,
    timestamp: data.timestamp,
    date: data.date,
    activity: data.activity,
    duration: data.duration ?? 0,
    bible_study: data.bible_study,
    place: orNull(data.place),
    city: orNull(data.city),
    state: orNull(data.state),
    country: orNull(data.country),
    draft: data.draft,
    thumbnail: data.thumbnail,
    files,
    directus: "",
    wp: wpId,
    // NOTE(review): always false, regardless of any `private` flag on the
    // input record — confirm this is intended.
    private: false,
    slug: orNull(data.translations?.[0]?.interventions?.[0]?.slug),
    body: orNull(data.body),
  };

  const saved = await client.collections('documents').documents().upsert(record);
  return saved.id;
}
|
||||||
|
|
@ -3,6 +3,7 @@
|
||||||
const fs = require('fs');
|
const fs = require('fs');
|
||||||
const path = require('path');
|
const path = require('path');
|
||||||
|
|
||||||
|
const { parse } = require('json2csv');
|
||||||
const striptags = require('striptags');
|
const striptags = require('striptags');
|
||||||
const dayjs = require('dayjs');
|
const dayjs = require('dayjs');
|
||||||
const he = require('he');
|
const he = require('he');
|
||||||
|
|
@ -128,74 +129,95 @@ async function generateJson( type, year ) {
|
||||||
let nitem = {}
|
let nitem = {}
|
||||||
|
|
||||||
if (type == 'activities') {
|
if (type == 'activities') {
|
||||||
|
nitem.locale = LOCALE;
|
||||||
nitem.id = item.id.toString()
|
nitem.id = item.id.toString()
|
||||||
nitem.type = 'activities'
|
nitem.type = 'activities'
|
||||||
nitem.title = item.title
|
nitem.title = item.title
|
||||||
nitem.date = dayjs(item.date).unix()
|
nitem.timestamp = dayjs(item.date).unix()
|
||||||
|
nitem.date = item.date
|
||||||
nitem.activity = parseInt(item.activity)
|
nitem.activity = parseInt(item.activity)
|
||||||
|
nitem.duration = item.duration ?? 0;
|
||||||
nitem.bible_study = parseInt(item.bible_study)
|
nitem.bible_study = parseInt(item.bible_study)
|
||||||
nitem.place = item.place || null;
|
nitem.place = item.place || null;
|
||||||
nitem.city = item.city || null;
|
nitem.city = item.city || null;
|
||||||
nitem.state = item.state || null;
|
nitem.state = item.state || null;
|
||||||
nitem.country = item.country || null;
|
nitem.country = item.country || null;
|
||||||
nitem.duration = item.duration ?? 0;
|
|
||||||
nitem.body = item.translations?.[0]?.interventions?.[0]?.text
|
nitem.body = item.translations?.[0]?.interventions?.[0]?.text
|
||||||
//nitem.text = striptags(he.decode(item.translations?.[0]?.interventions?.[0]?.text || ''))
|
// nitem.text = striptags(he.decode(item.translations?.[0]?.interventions?.[0]?.text || ''))
|
||||||
nitem.draft = item.draft
|
nitem.draft = item.draft
|
||||||
nitem.private = false
|
|
||||||
nitem.year = dayjs(item.date).year().toString()
|
|
||||||
nitem.month = (dayjs(item.date).month() + 1).toString().padStart(2, "0")
|
|
||||||
nitem.thumbnail = item.thumbnail
|
nitem.thumbnail = item.thumbnail
|
||||||
nitem.slug = item.translations[0]?.interventions[0]?.slug
|
|
||||||
nitem.files = {}
|
nitem.files = {}
|
||||||
nitem.files.youtube = item.youtube
|
nitem.youtube = item.youtube
|
||||||
nitem.files.video = item.files?.videos?.file
|
nitem.video = item.files?.videos?.file
|
||||||
nitem.files.audio = item.files?.audios[0]?.[0]?.file2
|
nitem.audio = item.files?.audios[0]?.[0]?.file2
|
||||||
nitem.files.booklet = item.files?.textos[0]?.[1]?.file2
|
nitem.booklet = item.files?.textos[0]?.[1]?.file2
|
||||||
|
nitem.simple = item.files?.textos[0]?.[0]?.file2
|
||||||
|
nitem.directus = "";
|
||||||
|
nitem.wp = item.id.toString()
|
||||||
|
nitem.typesense = true;
|
||||||
|
nitem.rm = item.rm;
|
||||||
|
nitem.private = false
|
||||||
|
nitem.slug = item.translations[0]?.interventions[0]?.slug
|
||||||
|
|
||||||
//Filter out anything before 25/12/2021
|
//Filter out anything before 25/12/2021
|
||||||
if( nitem.date < 1640408400 ) {
|
// if( nitem.date < 1640408400 ) {
|
||||||
return false
|
// return false
|
||||||
}
|
// }
|
||||||
}
|
}
|
||||||
|
|
||||||
if (type == 'conferences') {
|
if (type == 'conferences') {
|
||||||
|
nitem.locale = LOCALE;
|
||||||
nitem.id = item.id.toString()
|
nitem.id = item.id.toString()
|
||||||
nitem.type = 'conferences'
|
nitem.type = 'conferences'
|
||||||
nitem.title = item.title
|
nitem.title = item.title
|
||||||
nitem.date = item.timestamp
|
nitem.timestamp = item.timestamp
|
||||||
|
nitem.date = ''
|
||||||
nitem.activity = parseInt(item.activity)
|
nitem.activity = parseInt(item.activity)
|
||||||
|
nitem.duration = item.duration ?? 0;
|
||||||
//nitem.bible_study = parseInt(item.bible_study)
|
//nitem.bible_study = parseInt(item.bible_study)
|
||||||
//nitem.place = item.conferences_id?.place || null;
|
//nitem.place = item.conferences_id?.place || null;
|
||||||
|
nitem.place = '';
|
||||||
nitem.city = item.city || null;
|
nitem.city = item.city || null;
|
||||||
nitem.state = item.state || null;
|
nitem.state = item.state || null;
|
||||||
nitem.country = item.country || null;
|
nitem.country = item.country || null;
|
||||||
nitem.duration = item.duration ?? 0;
|
nitem.body = item.body;
|
||||||
nitem.body = item.body
|
nitem.thumbnail = item.thumbnail
|
||||||
|
nitem.youtube = item.files?.youtube
|
||||||
|
nitem.video = item.files?.video
|
||||||
|
nitem.audio = item.files?.audio
|
||||||
|
nitem.booklet = item.files?.pdf
|
||||||
|
nitem.simple = item.files?.pdf_simple
|
||||||
|
nitem.directus = '';
|
||||||
|
nitem.wp = item.id.toString();
|
||||||
|
nitem.typesense = true;
|
||||||
|
nitem.rm = item.rm;
|
||||||
nitem.private = item.private == 1 ? true : false;
|
nitem.private = item.private == 1 ? true : false;
|
||||||
//nitem.draft = item.draft
|
//nitem.draft = item.draft
|
||||||
nitem.year = dayjs(item.date).year().toString()
|
//nitem.year = dayjs(item.date).year().toString()
|
||||||
nitem.month = (dayjs(item.date).month() + 1).toString().padStart(2, "0")
|
//nitem.month = (dayjs(item.date).month() + 1).toString().padStart(2, "0")
|
||||||
nitem.thumbnail = item.thumbnail
|
|
||||||
nitem.slug = item.slug
|
nitem.slug = item.slug
|
||||||
nitem.files = {}
|
//nitem.files = {}
|
||||||
nitem.files.youtube = item.files?.youtube
|
|
||||||
nitem.files.video = item.files?.video
|
|
||||||
nitem.files.audio = item.files?.audio
|
|
||||||
nitem.files.booklet = item.files?.pdf
|
|
||||||
nitem.files.simple = item.files?.pdf_simple
|
|
||||||
}
|
|
||||||
|
|
||||||
if (nitem.title && nitem.id) {
|
|
||||||
return nitem;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// if (nitem.title && nitem.id) {
|
||||||
|
// return nitem;
|
||||||
|
// }
|
||||||
|
return nitem;
|
||||||
})
|
})
|
||||||
|
|
||||||
|
if( nitems.length > 0 ){
|
||||||
|
//let csv = parse(nitems);
|
||||||
|
// jsonlData += csv
|
||||||
|
//let csv = nitems.map(row => fields.map(fieldName => JSON.stringify(row[fieldName], replacer)).join(','));
|
||||||
|
//return csv;
|
||||||
jsonlData += nitems.filter(item => item !== false).map(JSON.stringify).join('\n')
|
jsonlData += nitems.filter(item => item !== false).map(JSON.stringify).join('\n')
|
||||||
|
//jsonlData += (nitems.filter(item => item !== false))
|
||||||
|
//jsonlData += csv;
|
||||||
|
|
||||||
|
//console.log( csv );
|
||||||
//console.log( documents.length + " documents to write for year " + year + " and type " + type );
|
//console.log( documents.length + " documents to write for year " + year + " and type " + type );
|
||||||
writeFile(jsonlData, type, year)
|
}
|
||||||
|
writeFile(jsonlData, type, year)
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
@ -226,7 +248,7 @@ for( let year = 1974; year < 2019; year++){
|
||||||
generateJson( 'conferences', year );
|
generateJson( 'conferences', year );
|
||||||
}
|
}
|
||||||
|
|
||||||
// for( let year = 2021; year < 2027; year++){
|
// for( let year = 2021; year < 2027; year++){
|
||||||
// generateJson( 'activities', year );
|
// generateJson( 'activities', year );
|
||||||
// //generateJson( 'activities_translations' );
|
// // //generateJson( 'activities_translations' );
|
||||||
// }
|
// }
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -9,15 +9,22 @@
|
||||||
"license": "ISC",
|
"license": "ISC",
|
||||||
"description": "",
|
"description": "",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
"@babel/runtime": "^7.29.2",
|
||||||
|
"@xmldom/xmldom": "^0.9.10",
|
||||||
"cheerio": "^1.2.0",
|
"cheerio": "^1.2.0",
|
||||||
"dayjs": "^1.11.19",
|
"dayjs": "^1.11.19",
|
||||||
"he": "^1.2.0",
|
"he": "^1.2.0",
|
||||||
|
"js-jsonl": "^1.1.1",
|
||||||
|
"jsdom": "^29.1.1",
|
||||||
"json-to-frontmatter-markdown": "^1.0.0",
|
"json-to-frontmatter-markdown": "^1.0.0",
|
||||||
"json-to-jsonl": "^1.1.0",
|
"json-to-jsonl": "^1.1.0",
|
||||||
|
"json2csv": "^6.0.0-alpha.2",
|
||||||
|
"jsonlines": "^0.1.1",
|
||||||
"mustache": "^4.2.0",
|
"mustache": "^4.2.0",
|
||||||
"mysql": "^2.18.1",
|
"mysql": "^2.18.1",
|
||||||
"node-fetch": "^3.3.2",
|
"node-fetch": "^3.3.2",
|
||||||
"request": "^2.88.2",
|
"request": "^2.88.2",
|
||||||
"striptags": "^3.2.0"
|
"striptags": "^3.2.0",
|
||||||
|
"typesense": "^3.0.6"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue