Move cultural groups to regex to fix mismatches

This commit is contained in:
SpaceMonkey 2025-02-01 00:29:48 +00:00
parent 467535ca84
commit ef85c73afb

View file

@ -1,24 +1,30 @@
const fs = require('fs'); const fs = require('fs');
const path = require('path'); const path = require('path');
// Define cultural groups by country // Define cultural groups by country with more precise matching
const culturalGroups = { const culturalGroups = {
anglosphere: [ anglosphere: [
'united kingdom', 'uk', 'britain', 'england', 'scotland', 'wales', 'northern ireland', 'united kingdom', 'uk', 'great britain', 'britain',
'united states', 'usa', 'america', 'canada', 'australia', 'new zealand', 'ireland' 'england', 'scotland', 'wales', 'northern ireland',
'^united states$', '^usa$', '^us$', '^america$',
'^canada$',
'^australia$',
'^new zealand$',
'^ireland$'
], ],
francophone: [ francophone: [
'france', 'belgium', 'switzerland', 'quebec', 'monaco', '^france$', '^belgium$', '^switzerland$', '^quebec$', '^monaco$',
'luxembourg', 'haiti', 'ivory coast', 'senegal', 'cameroon' '^luxembourg$', '^haiti$', '^ivory coast$', '^senegal$', '^cameroon$'
], ],
hispanic: [ hispanic: [
'spain', 'mexico', 'argentina', 'chile', 'colombia', 'peru', '^spain$', '^mexico$', '^argentina$', '^chile$', '^colombia$', '^peru$',
'venezuela', 'ecuador', 'guatemala', 'cuba', 'dominican republic', '^venezuela$', '^ecuador$', '^guatemala$', '^cuba$', '^dominican republic$',
'honduras', 'el salvador', 'nicaragua', 'costa rica', 'panama' '^honduras$', '^el salvador$', '^nicaragua$', '^costa rica$', '^panama$',
'^bolivia$', '^paraguay$', '^uruguay$', 'latin america'
], ],
lusophone: [ lusophone: [
'portugal', 'brazil', 'angola', 'mozambique', '^portugal$', '^brazil$', '^angola$', '^mozambique$',
'cape verde', 'guinea-bissau', 'sao tome and principe' '^cape verde$', '^guinea-bissau$', '^sao tome and principe$'
], ],
arabic: [ arabic: [
'saudi arabia', 'egypt', 'uae', 'united arab emirates', 'qatar', 'saudi arabia', 'egypt', 'uae', 'united arab emirates', 'qatar',
@ -27,54 +33,54 @@ const culturalGroups = {
'algeria', 'morocco', 'sudan' 'algeria', 'morocco', 'sudan'
], ],
germanosphere: [ germanosphere: [
'germany', 'austria', 'switzerland', 'luxembourg', 'liechtenstein' '^germany$', '^austria$', '^switzerland$', '^luxembourg$', '^liechtenstein$'
], ],
slavic: [ slavic: [
'russia', 'ukraine', 'belarus', 'poland', 'czech republic', '^russia$', '^ukraine$', '^belarus$', '^poland$', '^czech republic$',
'slovakia', 'serbia', 'croatia', 'bosnia', 'montenegro', '^slovakia$', '^serbia$', '^croatia$', '^bosnia$', '^montenegro$',
'slovenia', 'bulgaria', 'north macedonia' '^slovenia$', '^bulgaria$', '^north macedonia$'
], ],
sinosphere: [ sinosphere: [
'china', 'hong kong', 'taiwan', 'singapore', 'macau' '^china$', 'hong kong', '^taiwan$', '^singapore$', '^macau$'
], ],
indosphere: [ indosphere: [
'india', 'pakistan', 'bangladesh', 'nepal', 'sri lanka', '^india$', '^pakistan$', '^bangladesh$', '^nepal$', '^sri lanka$',
'bhutan', 'maldives' '^bhutan$', '^maldives$'
], ],
turkic: [ turkic: [
'turkey', 'azerbaijan', 'uzbekistan', 'kazakhstan', '^turkey$', '^azerbaijan$', '^uzbekistan$', '^kazakhstan$',
'kyrgyzstan', 'turkmenistan' '^kyrgyzstan$', '^turkmenistan$'
], ],
nordic: [ nordic: [
'sweden', 'norway', 'denmark', 'finland', 'iceland', '^sweden$', '^norway$', '^denmark$', '^finland$', '^iceland$',
'faroe islands', 'greenland' 'faroe islands', '^greenland$'
], ],
baltic: [ baltic: [
'estonia', 'latvia', 'lithuania' '^estonia$', '^latvia$', '^lithuania$'
], ],
hellenic: [ hellenic: [
'greece', 'cyprus' '^greece$', '^cyprus$'
], ],
benelux: [ benelux: [
'netherlands', 'belgium', 'luxembourg' '^netherlands$', '^belgium$', '^luxembourg$'
], ],
persian: [ persian: [
'iran', 'afghanistan', 'tajikistan' '^iran$', '^afghanistan$', '^tajikistan$'
], ],
malaysphere: [ malaysphere: [
'malaysia', 'brunei', 'indonesia' '^malaysia$', '^brunei$', '^indonesia$'
], ],
korean: [ korean: [
'south korea', 'korea', 'north korea' 'south korea', 'korea', 'north korea'
], ],
japanese: [ japanese: [
'japan' '^japan$'
], ],
vietnamese: [ vietnamese: [
'vietnam' '^vietnam$'
], ],
thai: [ thai: [
'thailand' '^thailand$'
] ]
}; };
@ -85,12 +91,21 @@ function getCulturalGroup(channelInfo) {
// Check if the country belongs to any cultural group // Check if the country belongs to any cultural group
for (const [group, countries] of Object.entries(culturalGroups)) { for (const [group, countries] of Object.entries(culturalGroups)) {
if (countries.some(country => groupTitle.includes(country))) { // Use exact matching with RegExp
if (countries.some(country => {
// If the country pattern starts with ^, use it as a RegExp
if (country.startsWith('^')) {
const regex = new RegExp(country, 'i');
return regex.test(groupTitle);
}
// Otherwise, use includes for flexible matching (for multi-word countries)
return groupTitle.includes(country);
})) {
return group; return group;
} }
} }
return null; // Return null instead of 'other' for non-matching channels return null;
} }
function splitByCulturalGroup(filePath) { function splitByCulturalGroup(filePath) {