157 lines
4.7 KiB
JavaScript
157 lines
4.7 KiB
JavaScript
import Typesense from "typesense";
|
|
import dayjs from "dayjs";
|
|
import { JSDOM } from "jsdom";
|
|
|
|
// Create an empty JSDOM instance and keep only its `window` object.
// NOTE(review): nothing in the visible code reads `window` — confirm it is still needed.
const window = new JSDOM().window;
|
|
|
|
// Typesense client used by createDocument() and createParagraphs().
//
// Each connection setting can be overridden with an environment variable
// (TYPESENSE_HOST, TYPESENSE_PORT, TYPESENSE_PROTOCOL, TYPESENSE_API_KEY).
// `||` is used on purpose so that an empty variable falls back to the default.
//
// NOTE(review): the fallback API key below is committed in source. It is kept
// so existing runs behave exactly as before, but it should be rotated and
// supplied only through TYPESENSE_API_KEY.
const client = new Typesense.Client({
  nodes: [
    {
      host: process.env.TYPESENSE_HOST || "searchts.carpa.com",
      port: process.env.TYPESENSE_PORT || "443",
      protocol: process.env.TYPESENSE_PROTOCOL || "https",
    },
  ],
  apiKey: process.env.TYPESENSE_API_KEY || "3KrmYlcirARCxG4AZPV5bnJgQD0qtoW0",
});
|
|
|
|
import fs from 'fs';
|
|
import readline from 'readline';
|
|
import { create } from "domain";
|
|
|
|
/**
 * Reads a JSON Lines file and imports every record into Typesense.
 *
 * For each non-blank line the record is parsed, passed to createDocument(),
 * and its `body` (when it is a non-empty string) is split into paragraphs
 * that are passed to createParagraphs():
 *   - if hasParagraphs() finds <p> elements, the body is split on <p>/</p>;
 *   - otherwise it is split on blank lines ("\n\n").
 *
 * A failure on one line (malformed JSON, rejected request, ...) is recorded
 * and processing continues with the next line.
 *
 * @param {string} filePath - Path of the JSON Lines file to import.
 * @returns {Promise<{lineCount: number, errors: Array<{line: *, lineNumber: number, error: string}>}>}
 *   Number of non-blank lines processed and the per-line failures. `line` is
 *   the record's `id` when the line could be parsed, otherwise null;
 *   `lineNumber` is the 1-based position in the file.
 */
async function processJsonLines(filePath) {
  const errors = [];
  const stream = fs.createReadStream(filePath);
  const rl = readline.createInterface({
    input: stream,
    crlfDelay: Infinity,
  });

  let lineCount = 0;
  let lineNumber = 0;

  for await (const line of rl) {
    lineNumber++;

    // Blank lines (e.g. a trailing newline) are not records.
    if (line.trim() === '') continue;
    lineCount++;

    // Declared outside the try so the catch block can report the record id
    // without parsing the line a second time.
    let data;
    try {
      data = JSON.parse(line);

      console.log('Processing: ', data.title);

      const doc_id = await createDocument(data);

      console.log('Document created with ID:', doc_id);

      const { body } = data;
      if (typeof body !== 'string' || body.length === 0) continue;

      const separator = hasParagraphs(body).length > 0 ? /<\/?p>/ : '\n\n';
      const paragraphs = body
        .split(separator)
        .map((p) => p.trim())
        .filter(Boolean);

      // Nothing to import when the body only contained markup or whitespace.
      if (paragraphs.length > 0) {
        await createParagraphs(doc_id, paragraphs, data.locale, data.type);
      }
    } catch (error) {
      errors.push({
        line: data?.id ?? null,
        lineNumber,
        error: error?.message ?? String(error),
      });
    }
  }

  console.log(`Finished processing. Total lines: ${lineCount}. Errors: ${errors.length}`);
  if (errors.length > 0) {
    console.log('Errors:', errors);
  }

  return { lineCount, errors };
}
|
|
|
|
/**
 * Parses an HTML string with JSDOM and looks up its <p> elements.
 *
 * Despite the name, this does not return a boolean: it returns the result of
 * `querySelectorAll('p')`, and the caller in this file checks its `.length`.
 *
 * @param {string} htmlString - HTML markup to inspect.
 * @returns {*} The `querySelectorAll('p')` result for the parsed document.
 */
function hasParagraphs(htmlString) {
  const { document } = new JSDOM(htmlString).window;
  return document.querySelectorAll('p');
}
|
|
|
|
// Script entry point: import one input file.
// Alternative input: './input/activities_wp_ES.json'
// A rejection (e.g. an unreadable input file) is logged and reported through
// the exit code instead of being left as an unhandled promise rejection.
processJsonLines('./input/conferences_wp_ES.json').catch((error) => {
  console.error('Import failed:', error);
  process.exitCode = 1;
});
|
|
|
|
// Function to create paragraphs in Typesense
/**
 * Builds one record per non-blank paragraph and upserts them into the
 * `paragraphs` collection with a single JSONL import call.
 *
 * Each record holds the paragraph wrapped in <p> tags (`raw`), a tag-free
 * version for searching (`text`), its 1-based position among the kept
 * paragraphs (`number`) and an id of the form `<locale>-<documentId>-<number>`.
 *
 * @param {string} documentId - Id of the parent document.
 * @param {string[]} paragraphs - Paragraph fragments (HTML or plain text).
 * @param {string} locale - Stored on each record and used as the id prefix.
 * @param {string} type - Stored on each record.
 * @returns {Promise<*>} Whatever the Typesense import call resolves to, or
 *   an empty string when no paragraph was left to import (no request is made).
 */
async function createParagraphs(documentId, paragraphs, locale, type) {
  // Non-breaking spaces written as HTML entities; String#trim() already
  // removes the literal U+00A0 character but not these.
  const nbspEntities = /&nbsp;|&#160;|&#xa0;/gi;

  const records = [];
  for (const para of paragraphs) {
    if (typeof para !== 'string') continue;

    let fixedHtml = para.trim();

    // Skip paragraphs that are empty or contain only (non-breaking) spaces.
    if (fixedHtml.replace(nbspEntities, '').trim() === '') continue;

    // If the paragraph doesn't start with <p>, wrap it in <p> tags.
    if (!fixedHtml.startsWith('<p')) {
      fixedHtml = `<p>${fixedHtml}</p>`;
    }

    const number = records.length + 1;
    records.push({
      document_id: documentId,
      raw: fixedHtml,
      text: fixedHtml.replace(/<[^>]+>/g, ''), // Remove HTML tags for search
      number,
      locale,
      type,
      id: `${locale}-${documentId}-${number}`, // Unique ID for the paragraph
    });
  }

  // Avoid sending an empty import payload.
  if (records.length === 0) return '';

  const jsonl = records.map((record) => `${JSON.stringify(record)}\n`).join('');
  return client.collections('paragraphs').documents().import(jsonl, { action: 'upsert' });
}
|
|
|
|
// Main function to create document in Typesense
/**
 * Maps one input record to the `documents` collection schema and upserts it.
 *
 * The `code` field is the record timestamp formatted as YYYYMMDD followed by
 * `-<activity>`. The record id is stringified and stored as both `id` and
 * `wp`. Optional location, file and slug fields fall back to null when falsy;
 * `duration` falls back to 0 only when null/undefined.
 *
 * @param {object} data - Parsed input record.
 * @returns {Promise<*>} The `id` of the object returned by the upsert call.
 */
async function createDocument(data){
  // Falsy values (undefined, '', 0, ...) are stored as null.
  const orNull = (value) => value || null;

  const code = `${dayjs.unix(data.timestamp).format('YYYYMMDD')}-${data.activity}`;
  const wpId = data.id.toString();

  const files = {
    youtube: data.youtube,
    video: orNull(data.video),
    audio: orNull(data.audio),
    booklet: orNull(data.booklet),
    simple: orNull(data.simple),
  };

  const record = {
    code,
    locale: data.locale,
    id: wpId,
    type: data.type,
    title: data.title,
    timestamp: data.timestamp,
    date: data.date,
    activity: data.activity,
    duration: data.duration ?? 0,
    bible_study: data.bible_study,
    place: orNull(data.place),
    city: orNull(data.city),
    state: orNull(data.state),
    country: orNull(data.country),
    draft: data.draft,
    thumbnail: data.thumbnail,
    files,
    directus: "",
    wp: wpId,
    rm: data.rm,
    private: data.private || false,
    slug: orNull(data.slug),
    body: "",
    year: data.year,
    month: data.month,
  };

  const saved = await client.collections('documents').documents().upsert(record);
  return saved.id;
}