Updating to export to typesense search engine

2026-05-16 22:00:07 -05:00 · 2026-05-16 22:00:07 -05:00 · 8ea83f825e
parent 2acad0c6c8
commit 8ea83f825e
4 changed files with 1127 additions and 40 deletions
--- a/carpa_json_to_markdown/document_update.mjs
+++ b/carpa_json_to_markdown/document_update.mjs
@ -0,0 +1,153 @@
 import Typesense from "typesense";
 import dayjs from "dayjs";
 import { JSDOM } from "jsdom";
 // A JSDOM window created at import time. Nothing in the visible code reads
 // `window` — NOTE(review): confirm it is unused before removing it.
 const { window } = new JSDOM();
 // Typesense client shared by createDocument() and createParagraphs().
 // Every connection setting can be overridden from the environment
 // (TYPESENSE_HOST, TYPESENSE_PORT, TYPESENSE_PROTOCOL, TYPESENSE_API_KEY);
 // the literals below are only fallbacks that keep the previous behaviour.
 // FIXME(security): the fallback API key is committed to the repository.
 // Rotate it, supply the new key through TYPESENSE_API_KEY and delete the literal.
 let client = new Typesense.Client({
  nodes: [
    {
      host: process.env.TYPESENSE_HOST ?? "searchts.carpa.com",
      port: process.env.TYPESENSE_PORT ?? "443",
      protocol: process.env.TYPESENSE_PROTOCOL ?? "https",
    }
  ],
  apiKey: process.env.TYPESENSE_API_KEY ?? "3KrmYlcirARCxG4AZPV5bnJgQD0qtoW0",
 });
 import fs from 'fs';
 import readline from 'readline';
 import { create } from "domain";
 /**
  * Streams a JSON Lines file and exports every record to Typesense.
  *
  * Each non-blank line is parsed as one JSON record. The record is upserted
  * through createDocument(); when its `body` is a non-empty string the body is
  * split into paragraphs (on <p> tags if hasParagraphs() finds any, otherwise
  * on blank lines) and those are upserted through createParagraphs().
  * A failure on one line is recorded and never stops the remaining lines.
  *
  * @param {string} filePath - Path of the JSON Lines file to read.
  * @returns {Promise<{lineCount: number, errors: Array<{line: *, lineNumber: number, error: string}>}>}
  *   `lineCount` is the number of non-blank lines processed; each error holds
  *   the record id (`line`, undefined when the line could not be parsed), the
  *   1-based physical line number and the error message.
  */
 async function processJsonLines(filePath) {
  const errors = [];
  const rl = readline.createInterface({
    input: fs.createReadStream(filePath),
    crlfDelay: Infinity,
  });
  let lineNumber = 0; // physical position in the file, blank lines included
  let lineCount = 0; // non-blank lines actually processed
  for await (const line of rl) {
    lineNumber++;
    // Blank lines are not records; parsing them would only produce noise errors.
    if (line.trim() === '') continue;
    lineCount++;
    // Declared outside the try so the catch can report the id without
    // re-parsing the line (re-parsing a malformed line would throw again).
    let data;
    try {
      data = JSON.parse(line);
      console.log('Processing: ', data.title);
      const doc_id = await createDocument(data);
      console.log('Document created with ID:', doc_id);
      const body = data.body;
      // Records without a textual body have no paragraphs to index.
      if (typeof body === 'string' && body.length > 0) {
        const pieces = hasParagraphs(body).length > 0
          ? body.split(/<\/?p>/) // HTML body: split on <p> / </p>
          : body.split('\n\n'); // plain text: split on blank lines
        const paragraphs = pieces.map(p => p.trim()).filter(Boolean);
        if (paragraphs.length > 0) {
          await createParagraphs(doc_id, paragraphs, data.locale, data.type);
        }
      }
    } catch (error) {
      errors.push({
        line: data?.id,
        lineNumber,
        error: error?.message ?? String(error),
      });
    }
  }
  console.log(`Finished processing. Total lines: ${lineCount}. Errors: ${errors.length}`);
  if (errors.length > 0) {
    console.log('Errors:', errors);
  }
  return { lineCount, errors };
 }
 /**
  * Parses an HTML string with JSDOM and collects its <p> elements.
  *
  * Despite the name this returns the matched elements, not a boolean;
  * callers check `.length` on the result.
  *
  * @param {string} htmlString - Markup to inspect.
  * @returns {NodeList} Every <p> element found in the parsed document.
  */
 function hasParagraphs(htmlString) {
  const { document } = new JSDOM(htmlString).window;
  return document.querySelectorAll('p');
 }
 //processJsonLines('./input/activities_wp_ES.json');
 // Entry point. The promise is handled explicitly so that a failure outside the
 // per-line error handling (for example a missing input file) is reported and
 // gives the process a failing exit code instead of an unhandled rejection.
 processJsonLines('./input/conferences_wp_ES.json').catch((error) => {
  console.error('Export failed:', error);
  process.exitCode = 1;
 });
 /**
  * Upserts the paragraphs of one document into the `paragraphs` collection.
  *
  * Blank and non-string entries are skipped. Every remaining paragraph is
  * stored twice: `raw` keeps the markup (wrapped in <p>…</p> unless it already
  * starts with `<p`) and `text` is the same string with all tags removed, for
  * searching. Paragraphs are numbered from 1 in the order they are kept, and
  * the record id is `<locale>-<documentId>-<number>`.
  *
  * @param {string} documentId - Id of the parent document.
  * @param {string[]} paragraphs - Paragraph fragments in document order.
  * @param {string} locale - Locale stored on each paragraph and used in its id.
  * @param {string} type - Document type stored on each paragraph.
  * @returns {Promise<*>} The response of the Typesense import call, or an empty
  *   string when there was nothing to import (no request is sent in that case).
  */
 async function createParagraphs(documentId, paragraphs, locale, type) {
  const records = [];
  for (const para of paragraphs ?? []) {
    const trimmed = typeof para === 'string' ? para.trim() : '';
    if (trimmed === '') continue; // Skip empty paragraphs
    // NOTE(review): the `<p` prefix test also matches tags such as <pre>.
    const raw = trimmed.startsWith('<p') ? trimmed : `<p>${trimmed}</p>`;
    const number = records.length + 1;
    records.push({
      document_id: documentId,
      raw,
      text: raw.replace(/<[^>]+>/g, ''), // Remove HTML tags for search
      number,
      locale,
      type,
      id: `${locale}-${documentId}-${number}`, // Unique ID for the paragraph
    });
  }
  // An empty payload is not a valid import request, so skip the round trip.
  if (records.length === 0) return '';
  const jsonl = records.map((record) => `${JSON.stringify(record)}\n`).join('');
  const result = await client.collections('paragraphs').documents().import(jsonl, { action: 'upsert' });
  return result;
 }
 /**
  * Builds the Typesense record for one exported item and upserts it into the
  * `documents` collection.
  *
  * The record `code` is `<locale>-<YYYYMMDD of timestamp>-<activity>`; `id` and
  * `wp` are both the item id as a string. Optional location, file, slug and
  * body values fall back to null when they are missing or otherwise falsy.
  *
  * @param {object} data - One parsed record from the input JSON Lines file.
  * @returns {Promise<string>} The `id` of the record returned by Typesense.
  */
 async function createDocument(data){
  const orNull = (value) => value || null;
  const dayStamp = dayjs.unix(data.timestamp).format('YYYYMMDD');
  const sourceId = data.id.toString();
  const sourceFiles = data.files;
  const firstTextSet = sourceFiles?.textos?.[0];

  const record = {
    code: `${data.locale}-${dayStamp}-${data.activity}`,
    locale: data.locale,
    id: sourceId,
    type: data.type,
    title: data.title,
    timestamp: data.timestamp,
    date: data.date,
    activity: data.activity,
    duration: data.duration ?? 0,
    bible_study: data.bible_study,
    place: orNull(data.place),
    city: orNull(data.city),
    state: orNull(data.state),
    country: orNull(data.country),
    draft: data.draft,
    thumbnail: data.thumbnail,
    files: {
      youtube: data.youtube,
      video: orNull(sourceFiles?.videos?.file),
      audio: orNull(sourceFiles?.audios?.[0]?.[0]?.file2),
      booklet: orNull(firstTextSet?.[1]?.file2),
      simple: orNull(firstTextSet?.[0]?.file2),
    },
    directus: "",
    wp: sourceId,
    private: false,
    slug: orNull(data.translations?.[0]?.interventions?.[0]?.slug),
    body: orNull(data.body),
  };

  const saved = await client.collections('documents').documents().upsert(record);
  return saved.id;
 }
--- a/carpa_json_to_markdown/index_wp.js
+++ b/carpa_json_to_markdown/index_wp.js
@ -3,6 +3,7 @@
 const fs = require('fs');
 const path = require('path');
 const { parse } = require('json2csv');
 const striptags = require('striptags');
 const dayjs = require('dayjs');
 const he = require('he');
@ -128,74 +129,95 @@ async function generateJson( type, year ) {
          let nitem = {}
          if (type == 'activities') {
            nitem.locale = LOCALE;
            nitem.id = item.id.toString()
            nitem.type = 'activities'
            nitem.title = item.title
-            nitem.date = dayjs(item.date).unix()
+            nitem.timestamp = dayjs(item.date).unix()
            nitem.date = item.date
            nitem.activity = parseInt(item.activity)
            nitem.duration = item.duration ?? 0;
            nitem.bible_study = parseInt(item.bible_study)
            nitem.place = item.place || null;
            nitem.city = item.city || null;
            nitem.state = item.state || null;
            nitem.country = item.country || null;
            nitem.duration = item.duration ?? 0;
            nitem.body = item.translations?.[0]?.interventions?.[0]?.text
-            //nitem.text = striptags(he.decode(item.translations?.[0]?.interventions?.[0]?.text || ''))
+//            nitem.text = striptags(he.decode(item.translations?.[0]?.interventions?.[0]?.text || ''))
            nitem.draft = item.draft
            nitem.private = false
            nitem.year = dayjs(item.date).year().toString()
            nitem.month = (dayjs(item.date).month() + 1).toString().padStart(2, "0")
            nitem.thumbnail = item.thumbnail
            nitem.slug = item.translations[0]?.interventions[0]?.slug
            nitem.files = {}
-            nitem.files.youtube = item.youtube
+            nitem.youtube = item.youtube
-            nitem.files.video = item.files?.videos?.file
+            nitem.video = item.files?.videos?.file
-            nitem.files.audio = item.files?.audios[0]?.[0]?.file2
+            nitem.audio = item.files?.audios[0]?.[0]?.file2
-            nitem.files.booklet = item.files?.textos[0]?.[1]?.file2
+            nitem.booklet = item.files?.textos[0]?.[1]?.file2
            nitem.simple = item.files?.textos[0]?.[0]?.file2
            nitem.directus = "";
            nitem.wp = item.id.toString()
            nitem.typesense = true;
            nitem.rm = item.rm;
            nitem.private = false
            nitem.slug = item.translations[0]?.interventions[0]?.slug
            //Filter out anything before 25/12/2021
-            if( nitem.date < 1640408400 ) {
+            // if( nitem.date < 1640408400 ) {
-              return false
+            //   return false
-            }
+            // }
          }
          if (type == 'conferences') {
            nitem.locale = LOCALE;
            nitem.id = item.id.toString()
            nitem.type = 'conferences'
            nitem.title = item.title
-            nitem.date = item.timestamp
+            nitem.timestamp = item.timestamp
            nitem.date = ''
            nitem.activity = parseInt(item.activity)
            nitem.duration = item.duration ?? 0;
            //nitem.bible_study = parseInt(item.bible_study)
            //nitem.place = item.conferences_id?.place || null;
            nitem.place = '';
            nitem.city = item.city || null;
            nitem.state = item.state || null;
            nitem.country = item.country || null;
-            nitem.duration = item.duration ?? 0;
+            nitem.body = item.body;
-            nitem.body = item.body
+            nitem.thumbnail = item.thumbnail
            nitem.youtube = item.files?.youtube
            nitem.video = item.files?.video
            nitem.audio = item.files?.audio
            nitem.booklet = item.files?.pdf
            nitem.simple = item.files?.pdf_simple
            nitem.directus = '';
            nitem.wp = item.id.toString();
            nitem.typesense = true;
            nitem.rm = item.rm;
            nitem.private = item.private == 1 ? true : false;
            //nitem.draft = item.draft
-            nitem.year = dayjs(item.date).year().toString()
+            //nitem.year = dayjs(item.date).year().toString()
-            nitem.month = (dayjs(item.date).month() + 1).toString().padStart(2, "0")
+            //nitem.month = (dayjs(item.date).month() + 1).toString().padStart(2, "0")
            nitem.thumbnail = item.thumbnail
            nitem.slug = item.slug
-            nitem.files = {}
+            //nitem.files = {}
            nitem.files.youtube = item.files?.youtube
            nitem.files.video = item.files?.video
            nitem.files.audio = item.files?.audio
            nitem.files.booklet = item.files?.pdf
            nitem.files.simple = item.files?.pdf_simple
          }
          if (nitem.title && nitem.id) {
            return nitem;
          }
          // if (nitem.title && nitem.id) {
          //   return nitem;
          // }
          return nitem;
        })
        if( nitems.length > 0 ){
        //let csv = parse(nitems);
        // jsonlData += csv
        //let csv = nitems.map(row => fields.map(fieldName => JSON.stringify(row[fieldName], replacer)).join(','));
        //return csv;
        jsonlData += nitems.filter(item => item !== false).map(JSON.stringify).join('\n')
        //jsonlData += (nitems.filter(item => item !== false))
        //jsonlData += csv;
        //console.log( csv );
        //console.log( documents.length + " documents to write for year " + year + " and type " + type );
-        writeFile(jsonlData, type, year)
+        }
                writeFile(jsonlData, type, year)
      };
    }
    });
@ -226,7 +248,7 @@ for( let year = 1974; year < 2019; year++){
  generateJson( 'conferences', year );
 }
-// for( let year = 2021; year < 2027; year++){
+//  for( let year = 2021; year < 2027; year++){
-//   generateJson( 'activities', year );
+//    generateJson( 'activities', year );
-// //generateJson( 'activities_translations' );
+// // //generateJson( 'activities_translations' );
-// }
+//  }
--- a/carpa_json_to_markdown/package-lock.json
+++ b/carpa_json_to_markdown/package-lock.json
--- a/carpa_json_to_markdown/package.json
+++ b/carpa_json_to_markdown/package.json
@ -9,15 +9,22 @@
  "license": "ISC",
  "description": "",
  "dependencies": {
    "@babel/runtime": "^7.29.2",
    "@xmldom/xmldom": "^0.9.10",
    "cheerio": "^1.2.0",
    "dayjs": "^1.11.19",
    "he": "^1.2.0",
    "js-jsonl": "^1.1.1",
    "jsdom": "^29.1.1",
    "json-to-frontmatter-markdown": "^1.0.0",
    "json-to-jsonl": "^1.1.0",
    "json2csv": "^6.0.0-alpha.2",
    "jsonlines": "^0.1.1",
    "mustache": "^4.2.0",
    "mysql": "^2.18.1",
    "node-fetch": "^3.3.2",
    "request": "^2.88.2",
-    "striptags": "^3.2.0"
+    "striptags": "^3.2.0",
    "typesense": "^3.0.6"
  }
 }