// api/carpa_json_to_markdown/document_update.mjs
// (157 lines, 4.7 KiB, JavaScript)

import Typesense from "typesense";
import dayjs from "dayjs";
import { JSDOM } from "jsdom";
// A bare JSDOM window created when the module loads.
// NOTE(review): `window` is not referenced anywhere in the visible code — confirm before removing.
const { window } = new JSDOM();

// Shared Typesense client used by the document and paragraph import helpers below.
//
// The API key is read from the TYPESENSE_API_KEY environment variable when it is set
// (an empty value counts as unset), so the credential no longer has to live in the source.
// SECURITY: the literal fallback is a credential committed to the repository. It is kept only
// so existing invocations keep working; it should be rotated and removed.
const client = new Typesense.Client({
  nodes: [
    {
      host: "searchts.carpa.com",
      port: "443",
      protocol: "https",
    },
  ],
  apiKey: process.env.TYPESENSE_API_KEY || "3KrmYlcirARCxG4AZPV5bnJgQD0qtoW0",
});
import fs from 'fs';
import readline from 'readline';
import { create } from "domain";
/**
 * Imports a JSON Lines file into Typesense, one record per line.
 *
 * Each non-blank line is parsed as JSON and passed to `createDocument`. When the record has a
 * body, the body is split into paragraphs which are passed to `createParagraphs` together with
 * the id returned by `createDocument`. A failure on one line is recorded and processing
 * continues with the next line.
 *
 * @param {string} filePath - Path of the JSON Lines file to read.
 * @returns {Promise<{lineCount: number, errors: Array<{line: *, lineNumber: number, error: string}>}>}
 *   `lineCount` is the number of non-blank lines processed. Each error entry carries the
 *   record's `id` in `line` (undefined when the line could not be parsed), the 1-based
 *   position of the line in the file, and the error message.
 */
async function processJsonLines(filePath) {
  const errors = [];
  const rl = readline.createInterface({
    input: fs.createReadStream(filePath),
    crlfDelay: Infinity,
  });

  let lineNumber = 0; // physical position in the file, blank lines included
  let lineCount = 0; // records actually processed

  for await (const line of rl) {
    lineNumber++;
    // Blank lines (e.g. a trailing newline) are not records; skip them instead of
    // reporting a JSON parse error for each one.
    if (line.trim() === '') continue;
    lineCount++;

    // Declared outside the try so the catch can report the record id without re-parsing
    // the line (re-parsing would throw again for malformed JSON and abort the whole run).
    let data;
    try {
      data = JSON.parse(line);
      console.log('Processing: ', data.title);
      const documentId = await createDocument(data);
      console.log('Document created with ID:', documentId);

      const paragraphs = splitBodyIntoParagraphs(data.body);
      // Nothing to index: avoid sending an empty import request.
      if (paragraphs.length > 0) {
        await createParagraphs(documentId, paragraphs, data.locale, data.type);
      }
    } catch (error) {
      errors.push({
        line: data?.id,
        lineNumber,
        error: error?.message ?? String(error),
      });
    }
  }

  console.log(`Finished processing. Total lines: ${lineCount}. Errors: ${errors.length}`);
  if (errors.length > 0) {
    console.log('Errors:', errors);
  }
  return { lineCount, errors };
}

/**
 * Splits a record body into trimmed, non-empty paragraph strings.
 *
 * A body in which `hasParagraphs` finds at least one <p> element is split on literal
 * `<p>` / `</p>` tags; any other string is split on blank lines.
 *
 * @param {string|null|undefined} body - Record body; a missing body yields no paragraphs.
 * @returns {string[]} The paragraphs, possibly empty.
 * @throws {TypeError} If `body` is present but is not a string.
 */
function splitBodyIntoParagraphs(body) {
  if (body == null) return [];
  if (typeof body !== 'string') {
    throw new TypeError(`Unsupported body type: ${typeof body}`);
  }
  const isHtml = body.length > 0 && hasParagraphs(body).length > 0;
  const separator = isHtml ? /<\/?p>/ : '\n\n';
  return body
    .split(separator)
    .map((part) => part.trim())
    .filter(Boolean);
}
/**
 * Parses an HTML string with JSDOM and returns every <p> element found in it.
 *
 * Despite the boolean-sounding name, the return value is the collection produced by
 * `querySelectorAll('p')`; the caller in this file inspects its `.length`.
 *
 * @param {string} htmlString - HTML markup to inspect.
 * @returns {*} The result of `document.querySelectorAll('p')` for the parsed markup.
 */
function hasParagraphs(htmlString) {
  const { document } = new JSDOM(htmlString).window;
  return document.querySelectorAll('p');
}
// Entry point. The input file can be given as the first command-line argument, e.g.
//   node document_update.mjs ./input/activities_wp_ES.json
// and defaults to the conferences export when no argument is passed.
// The rejection handler reports a failed run explicitly and marks the process as failed
// instead of leaving the promise unhandled.
processJsonLines(process.argv[2] ?? './input/conferences_wp_ES.json').catch((error) => {
  console.error('Import failed:', error);
  process.exitCode = 1;
});
/**
 * Upserts the paragraphs of one document into the Typesense `paragraphs` collection.
 *
 * Empty paragraphs and paragraphs made up only of `&nbsp;` entities are skipped. Every
 * remaining paragraph is stored twice: `raw` keeps the markup (wrapped in <p>…</p> when it
 * does not already start with `<p`) and `text` is the same string with all tags stripped,
 * for searching. Paragraphs are numbered from 1 in the order they are kept, and the number
 * is part of the paragraph id (`<locale>-<documentId>-<number>`).
 *
 * @param {string} documentId - Id of the parent document.
 * @param {string[]} paragraphs - Paragraph strings, with or without surrounding <p> tags.
 * @param {string} locale - Locale stored on each paragraph and used in its id.
 * @param {string} type - Document type stored on each paragraph.
 * @returns {Promise<*>} The response of the Typesense import call, or an empty string when
 *   no paragraph was left to import (no request is sent in that case).
 */
async function createParagraphs(documentId, paragraphs, locale, type) {
  // Matches a paragraph consisting solely of non-breaking-space entities and whitespace.
  const onlyNbsp = /^(?:&nbsp;?|\s)+$/i;
  const records = [];

  for (const para of paragraphs) {
    if (!para) continue;
    const trimmed = para.trim();
    if (trimmed === '' || onlyNbsp.test(trimmed)) continue;

    // Ensure the stored markup is a paragraph element.
    const raw = trimmed.startsWith('<p') ? trimmed : `<p>${trimmed}</p>`;
    const number = records.length + 1;
    records.push({
      document_id: documentId,
      raw,
      text: raw.replace(/<[^>]+>/g, ''), // tag-free copy used for search
      number,
      locale,
      type,
      id: `${locale}-${documentId}-${number}`, // unique per locale, document and position
    });
  }

  // Nothing survived the filtering: skip the request rather than importing an empty payload.
  if (records.length === 0) return '';

  // One JSON document per line, each line newline-terminated.
  const jsonl = records.map((record) => `${JSON.stringify(record)}\n`).join('');
  return client.collections('paragraphs').documents().import(jsonl, { action: 'upsert' });
}
/**
 * Builds a Typesense document from one parsed input record and upserts it into the
 * `documents` collection.
 *
 * @param {object} data - Parsed record; `id`, `timestamp` and `activity` are read to build
 *   the document `id`, `wp` and `code` fields, the remaining fields are copied over.
 * @returns {Promise<*>} The `id` of the document returned by the upsert call.
 */
async function createDocument(data) {
  // Falsy optional values are stored as null.
  const orNull = (value) => value || null;

  // `code` is the record's date (from its unix timestamp) followed by the activity.
  const code = dayjs.unix(data.timestamp).format('YYYYMMDD') + '-' + data.activity;
  const recordId = data.id.toString();

  const files = {
    youtube: data.youtube,
    video: orNull(data.video),
    audio: orNull(data.audio),
    booklet: orNull(data.booklet),
    simple: orNull(data.simple),
  };

  const record = {
    code,
    locale: data.locale,
    id: recordId,
    type: data.type,
    title: data.title,
    timestamp: data.timestamp,
    date: data.date,
    activity: data.activity,
    duration: data.duration ?? 0,
    bible_study: data.bible_study,
    place: orNull(data.place),
    city: orNull(data.city),
    state: orNull(data.state),
    country: orNull(data.country),
    draft: data.draft,
    thumbnail: data.thumbnail,
    files,
    directus: "",
    wp: recordId, // same value as `id`
    rm: data.rm,
    private: data.private || false,
    slug: orNull(data.slug),
    body: "", // the body text is not stored on the document itself
    year: data.year,
    month: data.month,
  };

  const saved = await client.collections('documents').documents().upsert(record);
  return saved.id;
}