Updated batch processes

This commit is contained in:
Julio Ruiz 2026-05-18 13:16:54 -05:00
parent 8ea83f825e
commit e83122945a
3 changed files with 101 additions and 81 deletions

View File

@ -88,6 +88,8 @@ async function createParagraphs(documentId, paragraphs, locale, type) {
//Format paragraph here. Check paragraphs and create a notag version of the text for better search results. We can also add a "raw" version of the text with all tags for display purposes. //Format paragraph here. Check paragraphs and create a notag version of the text for better search results. We can also add a "raw" version of the text with all tags for display purposes.
let fixedHtml = para.trim(); let fixedHtml = para.trim();
if(para == '&nbsp') continue; // Skip paragraphs that are just non-breaking spaces
if(!fixedHtml.startsWith('<p') ) { if(!fixedHtml.startsWith('<p') ) {
// If the paragraph doesn't start with <p>, wrap it in <p> tags // If the paragraph doesn't start with <p>, wrap it in <p> tags
fixedHtml = `<p>${fixedHtml}</p>`; fixedHtml = `<p>${fixedHtml}</p>`;
@ -115,7 +117,7 @@ async function createParagraphs(documentId, paragraphs, locale, type) {
// Main function to create document in Typesense // Main function to create document in Typesense
async function createDocument(data){ async function createDocument(data){
let document = { let document = {
code: data.locale + '-' + dayjs.unix(data.timestamp).format('YYYYMMDD') + '-' + data.activity, code: dayjs.unix(data.timestamp).format('YYYYMMDD') + '-' + data.activity,
locale: data.locale, locale: data.locale,
id: data.id.toString(), id: data.id.toString(),
type: data.type, type: data.type,
@ -133,17 +135,19 @@ async function createDocument(data){
thumbnail: data.thumbnail, thumbnail: data.thumbnail,
files: { files: {
youtube: data.youtube, youtube: data.youtube,
video: data.files?.videos?.file || null, video: data.video || null,
audio: data.files?.audios?.[0]?.[0]?.file2 || null, audio: data.audio || null,
booklet: data.files?.textos?.[0]?.[1]?.file2 || null, booklet: data.booklet || null,
simple: data.files?.textos?.[0]?.[0]?.file2 || null, simple: data.simple || null,
}, },
directus: "", directus: "",
wp: data.id.toString(), wp: data.id.toString(),
//rm: data.rm, rm: data.rm,
private: false, private: data.private || false,
slug: data.translations?.[0]?.interventions?.[0]?.slug || null, slug: data.slug || null,
body: data.body || null body: "",
year: data.year,
month: data.month,
}; };
let doc = await client.collections('documents').documents().upsert(document); let doc = await client.collections('documents').documents().upsert(document);

View File

@ -2,7 +2,7 @@
//import fetch from 'node-fetch'; // For ES Modules //import fetch from 'node-fetch'; // For ES Modules
const fs = require('fs'); const fs = require('fs');
const path = require('path'); const path = require('path');
const { parse } = require('json2csv');
const striptags = require('striptags'); const striptags = require('striptags');
const dayjs = require('dayjs'); const dayjs = require('dayjs');
const he = require('he'); const he = require('he');
@ -18,7 +18,7 @@ let jsonlData = '';
const DATA_INPUT_FOLDER = './input'; const DATA_INPUT_FOLDER = './input';
const DATA_OUTPUT_FOLDER = './output'; const DATA_OUTPUT_FOLDER = './output';
const TEMPLATES_FOLDER = './templates'; const TEMPLATES_FOLDER = './templates';
const LOCALE = 'pt'; const LOCALE = 'es';
function generateMarkdown(type) { function generateMarkdown(type) {
const folderPath = `${DATA_INPUT_FOLDER}/${type}`; const folderPath = `${DATA_INPUT_FOLDER}/${type}`;
@ -95,6 +95,7 @@ async function generateJson( type ) {
let fields = []; let fields = [];
fields['activities_translations'] = [ fields['activities_translations'] = [
'activities_id.id', 'activities_id.id',
'activities_id.wpid',
'title', 'title',
'activities_id.date', 'activities_id.date',
'activities_id.activity', 'activities_id.activity',
@ -150,93 +151,116 @@ async function generateJson( type ) {
let nitem = {} let nitem = {}
if (type == 'activities_translations') { if (type == 'activities_translations') {
nitem.id = item.activities_id?.id nitem.language = LOCALE;
nitem.type = 'activities' //nitem.id = item.activities_id?.id
//nitem.type = 'activities'
nitem.title = item.title nitem.title = item.title
nitem.timestamp = dayjs(item.activities_id?.date).unix()
nitem.date = '';
nitem.activity = item.activities_id?.activity;
nitem.duration = item.activities_id?.duration ?? 0;
//nitem.body = he.decode(striptags(item.interventions[0]?.text)) || '' //nitem.body = he.decode(striptags(item.interventions[0]?.text)) || ''
nitem.body = item.interventions[0]?.text
nitem.private = item.activities_id?.private == 1 ? true : false;
nitem.files = {}
nitem.files.youtube = item.youtube
nitem.files.video = item.privateVideo?.filename_disk
nitem.files.audio = item.mp3?.filename_disk
nitem.files.booklet = item.pdf_booklet?.filename_disk
nitem.files.simple = item.pdf?.filename_disk
nitem.slug = item.slug
nitem.place = item.activities_id?.place || null; nitem.place = item.activities_id?.place || null;
nitem.city = item.activities_id?.city || null; nitem.city = item.activities_id?.city || null;
nitem.state = item.activities_id?.state || null; nitem.state = item.activities_id?.state || null;
nitem.country = item.activities_id?.country || null; nitem.country = item.activities_id?.country || null;
nitem.duration = item.activities_id?.duration ?? 0; nitem.body = (item.interventions[0]?.text?true:false);
nitem.year = dayjs(item.activities_id?.date).year().toString()
nitem.month = nitem.year + " > " + (dayjs(item.activities_id?.date).month() + 1).toString().padStart(2, "0")
nitem.menu_country = nitem.country
nitem.menu_state = nitem.country + " > " + nitem.state
nitem.menu_city = nitem.country + " > " + nitem.state + " > " + nitem.city
nitem.menu_place = nitem.country + " > " + nitem.state + " > " + nitem.city + " > " + nitem.place
nitem.date = dayjs(item.activities_id?.date).unix()
nitem.thumbnail = item.activities_id?.thumbnail?.filename_disk nitem.thumbnail = item.activities_id?.thumbnail?.filename_disk
//nitem.files = {}
nitem.youtube = item.youtube
nitem.video = item.privateVideo?.filename_disk
nitem.audio = item.mp3?.filename_disk
nitem.booklet = item.pdf_booklet?.filename_disk
nitem.simple = item.pdf?.filename_disk
nitem.directus = item.activities_id?.id;
nitem.wp = item.activities_id?.wpid;
nitem.typesense = true;
nitem.private = item.activities_id?.private == 1 ? true : false;
nitem.slug = item.slug
//nitem.year = dayjs(item.activities_id?.date).year().toString()
//nitem.month = nitem.year + " > " + (dayjs(item.activities_id?.date).month() + 1).toString().padStart(2, "0")
// nitem.menu_country = nitem.country
// nitem.menu_state = nitem.country + " > " + nitem.state
// nitem.menu_city = nitem.country + " > " + nitem.state + " > " + nitem.city
// nitem.menu_place = nitem.country + " > " + nitem.state + " > " + nitem.city + " > " + nitem.place
//Filter out anything before 25/12/2021 //Filter out anything before 25/12/2021
if( nitem.date < 1640408400 ) { // if( nitem.date < 1640408400 ) {
return false // return false
} // }
} }
if (type == 'conferences_translations') { if (type == 'conferences_translations') {
nitem.id = item.conferences_id?.id.toString() nitem.language = LOCALE;
nitem.type = 'conferences' //nitem.id = item.conferences_id?.id.toString()
//nitem.type = 'conferences'
nitem.title = item.title nitem.title = item.title
//nitem.body = he.decode(striptags(item.text_published)) || '' //nitem.body = he.decode(striptags(item.text_published)) || ''
nitem.body = item.text_published //nitem.title = item.title
nitem.private = item.conferences_id?.public == 0 ? true : false; nitem.timestamp = dayjs(item.conferences_id?.date).unix()
nitem.files = {} nitem.date = '';
nitem.files.youtube = item.youtube
nitem.files.video = item.privateVideo?.filename_disk nitem.activity = item.conferences_id?.activity;
nitem.files.audio = item.mp3?.filename_disk nitem.duration = item.conferences_id?.duration ?? 0;
nitem.files.booklet = item.pdf_booklet?.filename_disk
nitem.files.simple = item.pdf?.filename_disk //nitem.body = he.decode(striptags(item.interventions[0]?.text)) || ''
nitem.slug = item.slug
nitem.place = item.conferences_id?.place || null; nitem.place = item.conferences_id?.place || null;
nitem.city = item.conferences_id?.city || null; nitem.city = item.conferences_id?.city || null;
nitem.state = item.conferences_id?.state || null; nitem.state = item.conferences_id?.state || null;
nitem.country = item.conferences_id?.country || null; nitem.country = item.conferences_id?.country || null;
nitem.duration = item.conferences_id?.duration ?? 0; nitem.body = (item.text_published?true:false);
nitem.year = dayjs(item.conferences_id?.date).year().toString()
nitem.month = nitem.year + " > " + (dayjs(item.conferences_id?.date).month() + 1).toString().padStart(2, "0")
nitem.menu_country = nitem.country
nitem.menu_state = nitem.country + " > " + nitem.state
nitem.menu_city = nitem.country + " > " + nitem.state + " > " + nitem.city
nitem.menu_place = nitem.country + " > " + nitem.state + " > " + nitem.city + " > " + nitem.place
nitem.date = dayjs(item.conferences_id?.date).unix()
nitem.thumbnail = item.conferences_id?.thumbnail?.filename_disk nitem.thumbnail = item.conferences_id?.thumbnail?.filename_disk
}
if (nitem.title && nitem.id) { //nitem.files = {}
return nitem; nitem.youtube = item.youtube
nitem.video = item.privateVideo?.filename_disk
nitem.audio = item.mp3?.filename_disk
nitem.booklet = item.pdf_booklet?.filename_disk
nitem.simple = item.pdf?.filename_disk
nitem.directus = item.conferences_id?.id.toString()
nitem.wp = '';
nitem.typesense = true;
nitem.private = item.conferences_id?.public == 0 ? true : false;
nitem.slug = item.slug
} }
return nitem;
}) })
jsonlData += nitems.filter(item => item).map(JSON.stringify).join('\n') //jsonlData += nitems.filter(item => item).map(JSON.stringify).join('\n')
//writeFile(jsonlData, type)
console.log()
if( nitems.length > 0 ){
let csv = parse(nitems);
jsonlData += csv
//let csv = nitems.map(row => fields.map(fieldName => JSON.stringify(row[fieldName], replacer)).join(','));
//return csv;
//jsonlData += nitems.filter(item => item !== false).map(JSON.stringify).join('\n')
//console.log( csv );
//console.log( documents.length + " documents to write for year " + year + " and type " + type );
writeFile(jsonlData, type) writeFile(jsonlData, type)
}
}; };
}); });
} }
function writeFile(jsonlData, type) { function writeFile(jsonlData, type) {
fs.writeFile(`./${DATA_INPUT_FOLDER}/${type}_${LOCALE.toLocaleUpperCase()}.json`, jsonlData, (err) => { fs.writeFile(`./${DATA_INPUT_FOLDER}/${type}_${LOCALE.toLocaleUpperCase()}.csv`, jsonlData, (err) => {
if (err) { if (err) {
console.error("Error writing file:", err); console.error("Error writing file:", err);
return; return;
@ -261,5 +285,5 @@ function writeFile(jsonlData, type) {
//for( let year = 2021; year < 2027; year++){ //for( let year = 2021; year < 2027; year++){
//generateJson( 'activities_translations' ); //generateJson( 'activities_translations' );
generateJson( 'activities_translations' ); generateJson( 'conferences_translations' );
//} //}

View File

@ -192,11 +192,11 @@ async function generateJson( type, year ) {
nitem.typesense = true; nitem.typesense = true;
nitem.rm = item.rm; nitem.rm = item.rm;
nitem.private = item.private == 1 ? true : false; nitem.private = item.private == 1 ? true : false;
//nitem.draft = item.draft nitem.draft = item.draft
//nitem.year = dayjs(item.date).year().toString() nitem.year = dayjs(item.date).year().toString()
//nitem.month = (dayjs(item.date).month() + 1).toString().padStart(2, "0") nitem.month = (dayjs(item.date).month() + 1).toString().padStart(2, "0")
nitem.slug = item.slug nitem.slug = item.slug
//nitem.files = {} nitem.files = {}
} }
// if (nitem.title && nitem.id) { // if (nitem.title && nitem.id) {
@ -206,16 +206,8 @@ async function generateJson( type, year ) {
}) })
if( nitems.length > 0 ){ if( nitems.length > 0 ){
//let csv = parse(nitems);
// jsonlData += csv
//let csv = nitems.map(row => fields.map(fieldName => JSON.stringify(row[fieldName], replacer)).join(','));
//return csv;
jsonlData += nitems.filter(item => item !== false).map(JSON.stringify).join('\n') jsonlData += nitems.filter(item => item !== false).map(JSON.stringify).join('\n')
//jsonlData += (nitems.filter(item => item !== false)) jsonlData += '\n';
//jsonlData += csv;
//console.log( csv );
//console.log( documents.length + " documents to write for year " + year + " and type " + type );
} }
writeFile(jsonlData, type, year) writeFile(jsonlData, type, year)
}; };