// api/carpa_json_to_markdown/document_update.mjs

import Typesense from "typesense";
import dayjs from "dayjs";
import { JSDOM } from "jsdom";

// Standalone jsdom window. Nothing in the visible part of this file reads it;
// it is kept so any code relying on the module-level `window` keeps working.
const { window } = new JSDOM();

// Typesense client shared by createDocument() and createParagraphs().
// SECURITY: an API key committed to source control should be treated as leaked.
// Prefer supplying it through the TYPESENSE_API_KEY environment variable; the
// literal below is only a backward-compatible fallback and should be rotated
// and removed.
const client = new Typesense.Client({
  nodes: [
    {
      host: "searchts.carpa.com",
      port: "443",
      protocol: "https",
    }
  ],
  apiKey: process.env.TYPESENSE_API_KEY ?? "3KrmYlcirARCxG4AZPV5bnJgQD0qtoW0",
});

import fs from 'fs';
import readline from 'readline';
import { create } from "domain";

/**
 * Reads a JSON Lines file and indexes every record it contains.
 *
 * Each non-blank line is parsed as JSON and stored through `createDocument`.
 * When the record has a string `body`, the body is split into paragraphs
 * (on `<p>` / `</p>` tags when `hasParagraphs` finds any, otherwise on blank
 * lines) and stored through `createParagraphs`.
 *
 * A failure on one line (malformed JSON included) is recorded and does not stop
 * the run. A summary with the number of lines read and the collected errors is
 * logged at the end.
 *
 * @param {string} filePath - Path of the JSON Lines file to read.
 * @returns {Promise<void>} Resolves once every line has been attempted.
 */
async function processJsonLines(filePath) {
  const errors = [];
  const stream = fs.createReadStream(filePath);
  const rl = readline.createInterface({
    input: stream,
    crlfDelay: Infinity,
  });

  let lineCount = 0;

  for await (const line of rl) {
    // Count every line read so the number matches the line position in the file.
    lineCount++;

    // Blank lines (e.g. a trailing newline pair) carry no record.
    if (line.trim() === '') {
      continue;
    }

    // Declared outside the try so the catch can report the record id when the
    // line parsed successfully, without parsing the line a second time.
    let data;
    try {
      data = JSON.parse(line);

      console.log('Processing: ', data.title);

      const doc_id = await createDocument(data);

      console.log('Document created with ID:', doc_id);

      // Only string bodies can be split; a missing or null body has no paragraphs.
      if (typeof data.body === 'string') {
        const usesParagraphTags = data.body.length > 0 && hasParagraphs(data.body).length > 0;
        const pieces = usesParagraphTags
          ? data.body.split(/<\/?p>/) // Split the body into paragraphs using <p> tags
          : data.body.split('\n\n'); // Plain text: paragraphs are separated by blank lines
        const paragraphs = pieces.map((p) => p.trim()).filter(Boolean);

        // Avoid sending an empty import request when nothing is left to index.
        if (paragraphs.length > 0) {
          await createParagraphs(doc_id, paragraphs, data.locale, data.type);
        }
      }
    } catch (error) {
      // Fall back to the file line number when the line never parsed or has no id.
      errors.push({ line: data?.id ?? `line ${lineCount}`, error: error.message });
    }
  }

  console.log(`Finished processing. Total lines: ${lineCount}. Errors: ${errors.length}`);
  if (errors.length > 0) {
    console.log('Errors:', errors);
  }
}

/**
 * Parses an HTML string with jsdom and collects its `<p>` elements.
 *
 * Despite the name, this returns the list of matches rather than a boolean;
 * callers check `.length` to decide whether the markup contains paragraphs.
 *
 * @param {string} htmlString - HTML markup to inspect.
 * @returns {NodeList} Result of `querySelectorAll('p')` on the parsed document.
 */
function hasParagraphs(htmlString) {
  const { document } = new JSDOM(htmlString).window;
  return document.querySelectorAll('p');
}

// Alternative input: processJsonLines('./input/activities_wp_ES.json');
processJsonLines('./input/conferences_wp_ES.json').catch((error) => {
  // Failures outside the per-line handling (e.g. an unreadable stream) would
  // otherwise surface as an unhandled promise rejection.
  console.error('Import failed:', error);
  process.exitCode = 1;
});

/**
 * Upserts the paragraphs of one document into the Typesense `paragraphs` collection.
 *
 * Each kept paragraph is stored twice: `raw` keeps the HTML (wrapped in `<p>` when
 * it does not already start with `<p`) for display, and `text` is the same string
 * with tags stripped for search. Paragraphs are numbered from 1 in the order kept,
 * and the record id is `<locale>-<documentId>-<number>`.
 *
 * Paragraphs that are empty, whitespace-only, or consist solely of `&nbsp;`
 * entities are skipped and do not consume a number.
 *
 * @param {string} documentId - Id of the parent document.
 * @param {string[]} paragraphs - Paragraph fragments, in document order.
 * @param {string} locale - Locale stored on every paragraph and used in its id.
 * @param {string} type - Document type stored on every paragraph.
 * @returns {Promise<*>} The response of the Typesense bulk import, or an empty
 *   string when no paragraph was kept (no request is sent in that case).
 */
async function createParagraphs(documentId, paragraphs, locale, type) {
  // Matches fragments made only of `&nbsp;` entities (with or without the
  // semicolon) and whitespace.
  const nbspOnly = /^(?:&nbsp;?\s*)+$/i;
  const records = [];

  for (const para of paragraphs) {
    if (!para) continue; // Skip empty paragraphs

    const trimmed = para.trim();
    if (trimmed === '' || nbspOnly.test(trimmed)) continue; // Nothing visible to index

    // If the paragraph doesn't start with <p>, wrap it in <p> tags
    const raw = trimmed.startsWith('<p') ? trimmed : `<p>${trimmed}</p>`;
    const number = records.length + 1;

    records.push({
      document_id: documentId,
      raw,
      text: raw.replace(/<[^>]+>/g, ''), // Remove HTML tags for search
      number,
      locale,
      type,
      id: `${locale}-${documentId}-${number}`, // Unique ID for the paragraph
    });
  }

  // An empty payload has nothing to import; skip the round trip.
  if (records.length === 0) {
    return '';
  }

  // One JSON object per line (JSONL), each line newline-terminated.
  const jsonl = records.map((record) => `${JSON.stringify(record)}\n`).join('');

  return client.collections('paragraphs').documents().import(jsonl, { action: 'upsert' });
}

/**
 * Upserts one source record into the Typesense `documents` collection.
 *
 * The stored `code` is the record's Unix `timestamp` formatted as `YYYYMMDD`,
 * a dash, and the `activity` value. The source `id` is stored as a string in
 * both `id` and `wp`. `body` and `directus` are always stored as empty strings.
 * Falsy location, slug and media values (except `youtube`, passed through as
 * is) become `null`; a nullish `duration` becomes 0 and a falsy `private`
 * becomes `false`.
 *
 * @param {object} data - Parsed source record; must have an `id`.
 * @returns {Promise<string>} The `id` of the document returned by the upsert.
 */
async function createDocument(data) {
  // Falsy values are stored as an explicit null.
  const orNull = (value) => value || null;

  const dayStamp = dayjs.unix(data.timestamp).format('YYYYMMDD');
  const sourceId = data.id.toString();

  const files = {
    youtube: data.youtube,
    video: orNull(data.video),
    audio: orNull(data.audio),
    booklet: orNull(data.booklet),
    simple: orNull(data.simple),
  };

  const record = {
    code: `${dayStamp}-${data.activity}`,
    locale: data.locale,
    id: sourceId,
    type: data.type,
    title: data.title,
    timestamp: data.timestamp,
    date: data.date,
    activity: data.activity,
    duration: data.duration ?? 0,
    bible_study: data.bible_study,
    place: orNull(data.place),
    city: orNull(data.city),
    state: orNull(data.state),
    country: orNull(data.country),
    draft: data.draft,
    thumbnail: data.thumbnail,
    files,
    directus: "",
    wp: sourceId,
    rm: data.rm,
    private: data.private || false,
    slug: orNull(data.slug),
    body: "",
    year: data.year,
    month: data.month,
  };

  const saved = await client.collections('documents').documents().upsert(record);

  return saved.id;
}