Updating to export to the Typesense search engine

2026-05-16 22:00:07 -05:00 · 2026-05-16 22:00:07 -05:00 · 8ea83f825e
parent 2acad0c6c8
commit 8ea83f825e
4 changed files with 1127 additions and 40 deletions
--- a/carpa_json_to_markdown/document_update.mjs
+++ b/carpa_json_to_markdown/document_update.mjs
@ -0,0 +1,153 @@
+import Typesense from "typesense";
+import dayjs from "dayjs";
+import { JSDOM } from "jsdom";
+
+const { window } = new JSDOM();
+
+// Typesense client shared by every import call in this script.
+// SECURITY: the API key can be supplied through the TYPESENSE_API_KEY
+// environment variable. The literal fallback is kept only so existing runs keep
+// working; it is committed to version control, so it should be rotated and the
+// fallback removed.
+const client = new Typesense.Client({
+  nodes: [
+    {
+      host: "searchts.carpa.com",
+      port: "443",
+      protocol: "https",
+    }
+  ],
+  apiKey: process.env.TYPESENSE_API_KEY || "3KrmYlcirARCxG4AZPV5bnJgQD0qtoW0",
+});
+
+import fs from 'fs';
+import readline from 'readline';
+import { create } from "domain";
+
+/**
+ * Reads a JSON Lines file and indexes every record in Typesense: the record
+ * itself through createDocument, then each paragraph of its body through
+ * createParagraphs.
+ *
+ * A failure on one line is collected and reported at the end; it does not stop
+ * the remaining lines from being processed.
+ *
+ * @param {string} filePath - Path to a file holding one JSON object per line.
+ * @returns {Promise<void>} Resolves once every line has been handled.
+ */
+async function processJsonLines(filePath) {
+  const errors = [];
+  const stream = fs.createReadStream(filePath);
+  const rl = readline.createInterface({
+    input: stream,
+    crlfDelay: Infinity,
+  });
+
+  let lineCount = 0;
+
+  for await (const line of rl) {
+    // Blank lines (for example a trailing newline) are not records.
+    if (line.trim() === '') continue;
+
+    lineCount++;
+
+    // Declared outside the try block so the catch can report the record id
+    // without parsing the line a second time (which would throw again on
+    // malformed JSON and abort the whole run).
+    let data;
+    try {
+      data = JSON.parse(line);
+
+      console.log('Processing: ', data.title);
+
+      const doc_id = await createDocument(data);
+
+      console.log('Document created with ID:', doc_id);
+
+      // Only string bodies can be split; a missing, null or empty body simply
+      // has no paragraphs to index.
+      if (typeof data.body === 'string' && data.body.length > 0) {
+        // HTML bodies are split on <p> tags, plain-text bodies on blank lines.
+        const pieces = hasParagraphs(data.body).length > 0
+          ? data.body.split(/<\/?p>/)
+          : data.body.split('\n\n');
+        const paragraphs = pieces.map(p => p.trim()).filter(Boolean);
+        await createParagraphs(doc_id, paragraphs, data.locale, data.type);
+      }
+    } catch (error) {
+      // Fall back to the line number when the line could not even be parsed.
+      errors.push({ line: data?.id ?? `#${lineCount}`, error: error.message });
+    }
+  }
+  console.log(`Finished processing. Total lines: ${lineCount}. Errors: ${errors.length}`);
+  if (errors.length > 0) {
+    console.log('Errors:', errors);
+  }
+}
+
+/**
+ * Parses an HTML string with jsdom and collects its <p> elements.
+ *
+ * Despite the name, the result is the element list itself, not a boolean;
+ * the caller in this file checks its `.length`.
+ *
+ * @param {string} htmlString - HTML to inspect.
+ * @returns {NodeList} The <p> elements found in the parsed document.
+ */
+function hasParagraphs(htmlString) {
+  const { document } = new JSDOM(htmlString).window;
+  return document.querySelectorAll('p');
+}
+
+//processJsonLines('./input/activities_wp_ES.json');
+// Do not leave the promise floating: report any rejection from the import run
+// and signal the failure through the process exit code.
+processJsonLines('./input/conferences_wp_ES.json').catch((error) => {
+  console.error('Import failed:', error);
+  process.exitCode = 1;
+});
+
+/**
+ * Upserts the paragraphs of one document into the Typesense `paragraphs`
+ * collection with a single bulk import.
+ *
+ * Each non-empty paragraph is stored twice: `raw` keeps the HTML (wrapped in
+ * <p>…</p> when it does not already start with `<p`) and `text` is the same
+ * content with tags stripped, for searching. Paragraphs are numbered from 1 in
+ * the order they are kept, and the record id is `<locale>-<documentId>-<number>`.
+ *
+ * @param {string} documentId - Id of the parent document.
+ * @param {string[]} paragraphs - Paragraph fragments; empty ones are skipped.
+ * @param {string} locale - Locale stored on every paragraph and used in its id.
+ * @param {string} type - Document type stored on every paragraph.
+ * @returns {Promise<*>} The response of the Typesense import call (not a
+ *   paragraph count). When there is nothing to import, no request is sent and
+ *   an empty string is returned.
+ */
+async function createParagraphs(documentId, paragraphs, locale, type) {
+  const records = [];
+
+  for (const para of paragraphs) {
+    if (!para || para.trim() === '') continue; // Skip empty paragraphs
+
+    const trimmed = para.trim();
+    // Fragments that lost their <p> wrapper during splitting get it back.
+    const raw = trimmed.startsWith('<p') ? trimmed : `<p>${trimmed}</p>`;
+    const number = records.length + 1;
+
+    records.push({
+      document_id: documentId,
+      raw,
+      text: raw.replace(/<[^>]+>/g, ''), // Remove HTML tags for search
+      number,
+      locale,
+      type,
+      id: `${locale}-${documentId}-${number}`, // Unique ID for the paragraph
+    });
+  }
+
+  // Avoid sending an import request with an empty payload.
+  if (records.length === 0) return '';
+
+  const jsonl = records.map((record) => `${JSON.stringify(record)}\n`).join('');
+
+  return client.collections('paragraphs').documents().import(jsonl, { action: 'upsert' });
+}
+
+/**
+ * Upserts one record into the Typesense `documents` collection.
+ *
+ * Media links and the slug are read from the flat fields first (`video`,
+ * `audio`, `booklet`, `simple`, `slug` — the shape the index_wp.js exporter
+ * writes) and fall back to the nested `files` / `translations` lookups.
+ * NOTE(review): assumes the input files are produced by that exporter — confirm.
+ *
+ * @param {object} data - One parsed record from the input file. `data.id` is
+ *   required; `data.timestamp` is read as Unix seconds to build `code`.
+ * @returns {Promise<string>} The `id` of the document returned by Typesense.
+ */
+async function createDocument(data){
+  const wpId = data.id.toString();
+  const day = dayjs.unix(data.timestamp).format('YYYYMMDD');
+
+  const document = {
+    code: `${data.locale}-${day}-${data.activity}`,
+    locale: data.locale,
+    id: wpId,
+    type: data.type,
+    title: data.title,
+    timestamp: data.timestamp,
+    date: data.date,
+    activity: data.activity,
+    duration: data.duration ?? 0,
+    bible_study: data.bible_study,
+    place: data.place || null,
+    city: data.city || null,
+    state: data.state || null,
+    country: data.country || null,
+    draft: data.draft,
+    thumbnail: data.thumbnail,
+    files: {
+      youtube: data.youtube ?? data.files?.youtube,
+      video: data.video || data.files?.videos?.file || null,
+      audio: data.audio || data.files?.audios?.[0]?.[0]?.file2 || null,
+      booklet: data.booklet || data.files?.textos?.[0]?.[1]?.file2 || null,
+      simple: data.simple || data.files?.textos?.[0]?.[0]?.file2 || null,
+    },
+    directus: "",
+    wp: wpId,
+    //rm: data.rm,
+    // Keep the flag from the record so private items are not indexed as
+    // public; anything other than an explicit `true` stays public.
+    private: data.private === true,
+    slug: data.slug || data.translations?.[0]?.interventions?.[0]?.slug || null,
+    body: data.body || null
+  };
+
+  const doc = await client.collections('documents').documents().upsert(document);
+
+  return doc.id;
+}
--- a/carpa_json_to_markdown/index_wp.js
+++ b/carpa_json_to_markdown/index_wp.js
@ -3,6 +3,7 @@
 const fs = require('fs');
 const path = require('path');

+const { parse } = require('json2csv');
 const striptags = require('striptags');
 const dayjs = require('dayjs');
 const he = require('he');
@ -128,73 +129,94 @@ async function generateJson( type, year ) {
          let nitem = {}

          if (type == 'activities') {
+            nitem.locale = LOCALE;
            nitem.id = item.id.toString()
            nitem.type = 'activities'
            nitem.title = item.title
-            nitem.date = dayjs(item.date).unix()
+            nitem.timestamp = dayjs(item.date).unix()
+            nitem.date = item.date
            nitem.activity = parseInt(item.activity)
+            nitem.duration = item.duration ?? 0;
            nitem.bible_study = parseInt(item.bible_study)
            nitem.place = item.place || null;
            nitem.city = item.city || null;
            nitem.state = item.state || null;
            nitem.country = item.country || null;
-            nitem.duration = item.duration ?? 0;
            nitem.body = item.translations?.[0]?.interventions?.[0]?.text
 //            nitem.text = striptags(he.decode(item.translations?.[0]?.interventions?.[0]?.text || ''))
            nitem.draft = item.draft
-            nitem.private = false
-            nitem.year = dayjs(item.date).year().toString()
-            nitem.month = (dayjs(item.date).month() + 1).toString().padStart(2, "0")
            nitem.thumbnail = item.thumbnail
-            nitem.slug = item.translations[0]?.interventions[0]?.slug
            nitem.files = {}
-            nitem.files.youtube = item.youtube
-            nitem.files.video = item.files?.videos?.file
-            nitem.files.audio = item.files?.audios[0]?.[0]?.file2
-            nitem.files.booklet = item.files?.textos[0]?.[1]?.file2
+            nitem.youtube = item.youtube
+            nitem.video = item.files?.videos?.file
+            nitem.audio = item.files?.audios[0]?.[0]?.file2
+            nitem.booklet = item.files?.textos[0]?.[1]?.file2
+            nitem.simple = item.files?.textos[0]?.[0]?.file2
+            nitem.directus = "";
+            nitem.wp = item.id.toString()
+            nitem.typesense = true;
+            nitem.rm = item.rm;
+            nitem.private = false
+            nitem.slug = item.translations[0]?.interventions[0]?.slug
            
            //Filter out anything before 25/12/2021
-            if( nitem.date < 1640408400 ) {
-              return false
-            }
+            // if( nitem.date < 1640408400 ) {
+            //   return false
+            // }
          }

          if (type == 'conferences') {
+            nitem.locale = LOCALE;
            nitem.id = item.id.toString()
            nitem.type = 'conferences'
            nitem.title = item.title
-            nitem.date = item.timestamp
+            nitem.timestamp = item.timestamp
+            nitem.date = ''
            nitem.activity = parseInt(item.activity)
+            nitem.duration = item.duration ?? 0;
            //nitem.bible_study = parseInt(item.bible_study)
            //nitem.place = item.conferences_id?.place || null;
+            nitem.place = '';
            nitem.city = item.city || null;
            nitem.state = item.state || null;
            nitem.country = item.country || null;
-            nitem.duration = item.duration ?? 0;
-            nitem.body = item.body
+            nitem.body = item.body;
+            nitem.thumbnail = item.thumbnail
+            nitem.youtube = item.files?.youtube
+            nitem.video = item.files?.video
+            nitem.audio = item.files?.audio
+            nitem.booklet = item.files?.pdf
+            nitem.simple = item.files?.pdf_simple
+            nitem.directus = '';
+            nitem.wp = item.id.toString();
+            nitem.typesense = true;
+            nitem.rm = item.rm;
            nitem.private = item.private == 1 ? true : false;
            //nitem.draft = item.draft
-            nitem.year = dayjs(item.date).year().toString()
-            nitem.month = (dayjs(item.date).month() + 1).toString().padStart(2, "0")
-            nitem.thumbnail = item.thumbnail
+            //nitem.year = dayjs(item.date).year().toString()
+            //nitem.month = (dayjs(item.date).month() + 1).toString().padStart(2, "0")
            nitem.slug = item.slug
-            nitem.files = {}
-            nitem.files.youtube = item.files?.youtube
-            nitem.files.video = item.files?.video
-            nitem.files.audio = item.files?.audio
-            nitem.files.booklet = item.files?.pdf
-            nitem.files.simple = item.files?.pdf_simple
+            //nitem.files = {}
          }

-          if (nitem.title && nitem.id) {
+          // if (nitem.title && nitem.id) {
+          //   return nitem;
+          // }
          return nitem;
-          }
-
        })

+        if( nitems.length > 0 ){
+        //let csv = parse(nitems);
+        // jsonlData += csv
+        //let csv = nitems.map(row => fields.map(fieldName => JSON.stringify(row[fieldName], replacer)).join(','));
+        //return csv;
        jsonlData += nitems.filter(item => item !== false).map(JSON.stringify).join('\n')
+        //jsonlData += (nitems.filter(item => item !== false))
+        //jsonlData += csv;

+        //console.log( csv );
        //console.log( documents.length + " documents to write for year " + year + " and type " + type );
+        }
                writeFile(jsonlData, type, year)
      };
    }
@ -228,5 +250,5 @@ for( let year = 1974; year < 2019; year++){

 //  for( let year = 2021; year < 2027; year++){
 //    generateJson( 'activities', year );
-// //generateJson( 'activities_translations' );
+// // //generateJson( 'activities_translations' );
 //  }
--- a/carpa_json_to_markdown/package-lock.json
+++ b/carpa_json_to_markdown/package-lock.json
--- a/carpa_json_to_markdown/package.json
+++ b/carpa_json_to_markdown/package.json
@ -9,15 +9,22 @@
  "license": "ISC",
  "description": "",
  "dependencies": {
+    "@babel/runtime": "^7.29.2",
+    "@xmldom/xmldom": "^0.9.10",
    "cheerio": "^1.2.0",
    "dayjs": "^1.11.19",
    "he": "^1.2.0",
+    "js-jsonl": "^1.1.1",
+    "jsdom": "^29.1.1",
    "json-to-frontmatter-markdown": "^1.0.0",
    "json-to-jsonl": "^1.1.0",
+    "json2csv": "^6.0.0-alpha.2",
+    "jsonlines": "^0.1.1",
    "mustache": "^4.2.0",
    "mysql": "^2.18.1",
    "node-fetch": "^3.3.2",
    "request": "^2.88.2",
-    "striptags": "^3.2.0"
+    "striptags": "^3.2.0",
+    "typesense": "^3.0.6"
  }
 }