Files
hitstar/scripts/resolve-years.js
Elmar Kresse 8c5ca0044f
All checks were successful
Build and Push Docker Image / docker (push) Successful in 21s
Refactor code structure for improved readability and maintainability
2025-09-04 21:53:54 +02:00

366 lines
12 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Resolve earliest release year for songs in data/ using MusicBrainz
// Usage: node scripts/resolve-years.js [--max N] [--force] [--file SUBSTRING]
// Respects MusicBrainz 1 req/sec guideline and caches results.
import fs from 'fs';
import fsp from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
import { parseFile as mmParseFile } from 'music-metadata';
import { setTimeout as wait } from 'timers/promises';
// ES modules have no built-in __filename/__dirname, so derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Parent of the directory containing this script.
const ROOT = path.resolve(__dirname, '..');
// Directory scanned for audio files; also holds the output and cache files below.
const DATA_DIR = path.join(ROOT, 'data');
// Result file written by main().
const OUT_JSON = path.join(DATA_DIR, 'years.json');
// Lookup cache read by readCache() and written by writeCache().
const CACHE_JSON = path.join(DATA_DIR, '.mb_cache.json');
// Contact string embedded in the User-Agent; taken from MB_CONTACT, falling back to 'local'.
const CONTACT = process.env.MB_CONTACT || 'local';
// User-Agent header value sent with every request made by mbFetchJson().
const USER_AGENT = `hitstar-years/0.1.0 (${CONTACT})`;
// Command-line arguments (everything after the node binary and script path) for flag lookups.
const args = new Set(process.argv.slice(2));
/**
 * Look up the value of a command-line option in `process.argv`.
 *
 * Both `--name=value` and `--name value` are accepted. In the second form the
 * following argument is only used as the value when it does not itself start
 * with `--` (i.e. it is not another option).
 *
 * @param {string} name - Option name including leading dashes, e.g. `--max`.
 * @param {*} defVal - Value returned when the option is absent or has no value.
 * @returns {*} The option's string value, or `defVal`.
 */
function getArgValue(name, defVal) {
  const prefix = `${name}=`;
  const i = process.argv.findIndex((a) => a === name || a.startsWith(prefix));
  if (i === -1) return defVal;
  const a = process.argv[i];
  // Take everything after the FIRST '=' so values that contain '=' themselves
  // (e.g. --file=a=b) are not truncated.
  if (a.startsWith(prefix)) return a.slice(prefix.length);
  const next = process.argv[i + 1];
  return next && !next.startsWith('--') ? next : defVal;
}
// Value of --max parsed as a base-10 integer; a missing or non-numeric value becomes 0,
// which main() treats as "no limit".
const MAX = parseInt(getArgValue('--max', '0'), 10) || 0;
// True when --force was passed; resolveOne() then bypasses cached results.
const FORCE = args.has('--force');
// Lower-cased value of --file; main() keeps only file names containing it ('' = no filter).
const FILE_FILTER = getArgValue('--file', '').toLowerCase();
/**
 * Reduce an artist or title string to a canonical form for comparison and cache keys.
 *
 * Steps, in order: drop "(feat. ...)" parentheticals; drop version/remaster
 * markers in brackets, after a dash, or in parentheses; turn "&" into "and";
 * drop bare "feat."/"ft." tokens; map typographic quotes to ASCII quotes;
 * replace every remaining character that is not an ASCII letter, digit, quote
 * or whitespace with a space; collapse whitespace; trim; lower-case.
 *
 * @param {*} str - Raw string (any falsy value yields '').
 * @returns {string} Normalized, lower-cased string.
 */
function normalize(str) {
  if (!str) return '';
  return (
    String(str)
      .replace(/\s*\([^)]*(feat\.|ft\.|featuring)[^)]*\)/gi, '') // remove (feat. ...)
      .replace(
        /\s*\[(?:radio edit|remaster(?:ed)?(?: \d{2,4})?|single version|album version|mono|stereo|live|version)\]/gi,
        ''
      )
      .replace(
        /\s*-\s*(?:radio edit|remaster(?:ed)?(?: \d{2,4})?|single version|album version|mono|stereo|live|version)\b/gi,
        ''
      )
      .replace(
        /\s*\((?:radio edit|remaster(?:ed)?(?: \d{2,4})?|single version|album version|mono|stereo|live|version|short mix|original mix|201\d remaster|20\d\d remaster)\)/gi,
        ''
      )
      .replace(/\s*&\s*/g, ' and ')
      .replace(/\s+feat\.?\s+/gi, ' ')
      .replace(/\s+ft\.?\s+/gi, ' ')
      // Typographic quotes are written as escapes so they cannot be confused
      // with their ASCII look-alikes when reading this file.
      .replace(/[\u201C\u201D]/g, '"')
      // Curly apostrophes must become "'" BEFORE the punctuation strip below,
      // otherwise "don\u2019t" turns into "don t" and no longer matches "don't".
      .replace(/[\u2018\u2019]/g, "'")
      .replace(/[^a-z0-9'"\s]/gi, ' ')
      .replace(/\s+/g, ' ')
      // Dashes were already replaced by spaces above, so trimming whitespace is
      // all that is needed at the edges.
      .trim()
      .toLowerCase()
  );
}
/**
 * Guess artist and title from a file name of the form "Artist - Title.ext".
 *
 * The split happens at the first " - " (dash surrounded by whitespace). When
 * no such separator exists the whole base name becomes the title and the
 * artist is left empty.
 *
 * @param {string} file - File name or path.
 * @returns {{artist: string, title: string}}
 */
function parseFromFilename(file) {
  const { name: stem } = path.parse(file);
  const parts = /^(.*?)\s+-\s+(.*)$/.exec(stem); // Artist - Title
  if (parts === null) {
    return { artist: '', title: stem };
  }
  const [, rawArtist, rawTitle] = parts;
  return { artist: rawArtist.trim(), title: rawTitle.trim() };
}
/**
 * Read title, artist, duration and year tags from an audio file.
 *
 * Any failure while parsing yields an all-empty record instead of an error,
 * so callers can fall back to other sources for the missing fields.
 *
 * @param {string} fp - Path of the audio file.
 * @returns {Promise<{title: string, artist: string, durationMs: (number|null), yearTag: (number|null)}>}
 */
async function getMeta(fp) {
  try {
    const { common, format } = await mmParseFile(fp, { duration: true });
    const seconds = format.duration;
    const durationMs = Number.isFinite(seconds) ? Math.round(seconds * 1000) : null;
    return {
      title: common.title || '',
      artist: common.artist || '',
      durationMs,
      yearTag: common.year || null,
    };
  } catch {
    // Unreadable or untagged file: report "nothing known".
    return { title: '', artist: '', durationMs: null, yearTag: null };
  }
}
/**
 * Load the lookup cache from CACHE_JSON.
 *
 * A missing, unreadable or unparsable file results in an empty cache, as does
 * a file whose parsed content is falsy (e.g. `null`).
 *
 * @returns {Promise<Object>} Parsed cache content or `{}`.
 */
async function readCache() {
  let parsed;
  try {
    const raw = await fsp.readFile(CACHE_JSON, 'utf8');
    parsed = JSON.parse(raw);
  } catch {
    return {};
  }
  return parsed || {};
}
/**
 * Persist the lookup cache to CACHE_JSON as pretty-printed (2-space) JSON.
 *
 * @param {Object} cache - Cache content to serialize.
 * @returns {Promise<void>}
 */
async function writeCache(cache) {
  const serialized = JSON.stringify(cache, null, 2);
  await fsp.writeFile(CACHE_JSON, serialized);
}
/**
 * Score how alike two strings are after normalization.
 *
 * Returns 0 when either normalized string is empty, 1 when they are equal,
 * and otherwise the Jaccard index of their space-separated token sets
 * (shared tokens divided by distinct tokens overall).
 *
 * @param {*} a - First string.
 * @param {*} b - Second string.
 * @returns {number} Similarity between 0 and 1.
 */
function similar(a, b) {
  const left = normalize(a);
  const right = normalize(b);
  if (!left || !right) return 0;
  if (left === right) return 1;

  const leftTokens = new Set(left.split(' '));
  const rightTokens = new Set(right.split(' '));

  let shared = 0;
  for (const token of leftTokens) {
    if (rightTokens.has(token)) shared += 1;
  }
  // |A ∪ B| = |A| + |B| - |A ∩ B|
  const distinct = leftTokens.size + rightTokens.size - shared;
  return shared / distinct;
}
/**
 * GET a URL and parse the response as JSON, retrying while the server throttles.
 *
 * A 503 or 429 response counts as "try again": the function waits for the
 * number of seconds given in the Retry-After header (2 s when the header is
 * missing, zero or not a number) and repeats the request, up to `retries`
 * attempts in total. Any other non-2xx status fails immediately.
 *
 * @param {string} url - URL to request.
 * @param {number} [retries=3] - Maximum number of attempts.
 * @returns {Promise<*>} Parsed JSON body.
 * @throws {Error} `HTTP <status> ...` for non-retryable statuses, or
 *   `Failed after retries ...` when every attempt was throttled.
 */
async function mbFetchJson(url, retries = 3) {
  let lastStatus = null;
  for (let attempt = 1; attempt <= retries; attempt++) {
    const res = await fetch(url, {
      headers: { 'User-Agent': USER_AGENT, Accept: 'application/json' },
    });
    if (res.status === 503 || res.status === 429) {
      lastStatus = res.status;
      // The throttled response's body is never read; cancel it so the
      // underlying connection is released. A failure to cancel is harmless.
      await res.body?.cancel().catch(() => {});
      // Only pause when another attempt follows; sleeping before giving up
      // would just delay the error.
      if (attempt < retries) {
        const ra = Number(res.headers.get('Retry-After')) || 2;
        await wait(ra * 1000);
      }
      continue;
    }
    if (!res.ok) {
      const text = await res.text().catch(() => '');
      throw new Error(`HTTP ${res.status} ${res.statusText} - ${text}`);
    }
    return res.json();
  }
  const detail = lastStatus === null ? '' : ` (last status ${lastStatus})`;
  throw new Error(`Failed after retries${detail}`);
}
/**
 * Search MusicBrainz for recordings matching an artist and title.
 *
 * @param {string} artist - Artist name to search for.
 * @param {string} title - Recording title to search for.
 * @returns {Promise<Array<Object>>} The `recordings` array of the response
 *   (empty when the response has none).
 * @throws {Error} Whatever mbFetchJson() throws.
 */
async function searchRecording(artist, title) {
  // Inside a quoted Lucene phrase a raw '"' ends the phrase early and '\' starts
  // an escape, so both must be backslash-escaped or the query is malformed.
  const escapePhrase = (s) => String(s).replace(/[\\"]/g, '\\$&');
  const q = `recording:"${escapePhrase(title)}" AND artist:"${escapePhrase(artist)}"`;
  const url = `https://musicbrainz.org/ws/2/recording?fmt=json&limit=25&query=${encodeURIComponent(q)}`;
  try {
    const json = await mbFetchJson(url);
    return json.recordings || [];
  } finally {
    // Rate limit: pause after every request, including failed ones, so the
    // next call never follows immediately.
    await wait(1300);
  }
}
/**
 * Fetch one MusicBrainz recording by id, including its releases and artist credits.
 *
 * Pauses 1.3 s after a successful request before returning, as rate limiting.
 *
 * @param {string} mbid - Recording id.
 * @returns {Promise<Object>} Parsed JSON of the recording lookup.
 */
async function getRecordingDetails(mbid) {
  const id = encodeURIComponent(mbid);
  const details = await mbFetchJson(
    `https://musicbrainz.org/ws/2/recording/${id}?fmt=json&inc=releases+artist-credits`
  );
  await wait(1300); // rate limit
  return details;
}
/**
 * Choose the search result that most plausibly is the wanted recording.
 *
 * Every candidate gets a score built from the search score, title and artist
 * similarity, closeness of duration (when both durations are known), number
 * of attached releases and a small bonus for older first-release years.
 * Candidates whose title AND artist similarity are both at least 0.55 count
 * as plausible matches. If any plausible match has a first-release year, the
 * one with the earliest year wins (ties go to the higher score); otherwise
 * the highest-scoring candidate overall is returned.
 *
 * @param {Array<Object>} candidates - Recording objects from the search response.
 * @param {string} artist - Wanted artist.
 * @param {string} title - Wanted title.
 * @param {(number|null)} durationMs - Local track duration, if known.
 * @returns {(Object|null)} The chosen recording, or null when there are no candidates.
 */
function pickBestRecording(candidates, artist, title, durationMs) {
  const wantedArtist = normalize(artist);
  const wantedTitle = normalize(title);
  const windowMs = 15000; // duration differences beyond 15 s earn nothing

  let top = null;
  let topScore = -Infinity;
  const plausible = [];

  for (const rec of candidates) {
    const credited = (rec['artist-credit'] || [])
      .map((credit) => credit.name || credit.artist?.name)
      .filter(Boolean)
      .join(' ');
    const titleSim = similar(rec.title || '', wantedTitle);
    const artistSim = similar(credited, wantedArtist);

    let score = (rec.score || 0) / 100 + titleSim * 1.5 + artistSim * 1.2;

    if (durationMs && rec.length) {
      const gap = Math.min(Math.abs(rec.length - durationMs), windowMs);
      score += Math.max(0, 1 - gap / windowMs) * 0.8;
    }

    // More attached releases = more evidence (capped at 5 releases).
    if (Array.isArray(rec.releases)) {
      score += Math.min(5, rec.releases.length) * 0.05;
    }

    const firstYear = parseDateToYear(rec['first-release-date']);
    if (firstYear) {
      // Slight preference for older originals: (2100 - year) / 2100,
      // i.e. about 0.07 for 1950 and about 0.05 for 2000, before the 0.3 weight.
      score += (Math.max(0, 2100 - firstYear) / 2100) * 0.3;
    }

    if (score > topScore) {
      topScore = score;
      top = rec;
    }

    const isPlausible = titleSim >= 0.55 && artistSim >= 0.55;
    if (isPlausible) {
      plausible.push({ r: rec, firstYear: firstYear || null, titleSim, artistSim, score });
    }
  }

  const dated = plausible.filter((entry) => entry.firstYear);
  if (dated.length === 0) return top;

  // Earliest year first; equal years ordered by descending score.
  dated.sort((x, y) => x.firstYear - y.firstYear || y.score - x.score);
  return dated[0].r;
}
/**
 * Extract the year from a date string that starts with four digits.
 *
 * @param {*} dateStr - Date such as "1999", "1999-05" or "1999-05-01".
 * @returns {(number|null)} The leading four digits as a number, or null when
 *   the input is falsy or does not start with four digits.
 */
function parseDateToYear(dateStr) {
  if (!dateStr) return null;
  const match = String(dateStr).match(/^(\d{4})/);
  return match === null ? null : Number(match[1]);
}
/**
 * Find the earliest entry in a list of date strings.
 *
 * Falsy entries and entries without a parsable year are skipped. The
 * remaining entries are compared as plain strings, smallest first.
 *
 * @param {Array<*>} dates - Date strings, possibly mixed with null/empty values.
 * @returns {{date: (string|null), year: (number|null)}} The earliest date and
 *   its year, or both null when no entry qualifies.
 */
function earliestDate(dates) {
  let winner = null;
  for (const candidate of dates) {
    if (!candidate) continue;
    const year = parseDateToYear(candidate);
    if (!year) continue;
    if (winner === null || candidate < winner.date) {
      winner = { date: candidate, year };
    }
  }
  return winner ?? { date: null, year: null };
}
/**
 * Resolve the earliest known release date/year for one track.
 *
 * Looks in `cache` first (unless FORCE is set), otherwise searches for
 * recordings, picks the best match and then scans the first ten candidates
 * whose title and artist similarity are both at least 0.5 for an earlier year.
 * Candidates without a first-release date may trigger a details lookup; at
 * most five such lookups are attempted per track. The result is stored in
 * `cache` under the normalized "artist|title" key.
 *
 * @param {string} file - File name recorded in the result.
 * @param {{artist: string, title: string, durationMs: (number|null)}} meta - Track metadata.
 * @param {Object} cache - Mutable cache object; updated in place on a fresh lookup.
 * @returns {Promise<Object>} Result record; `year` and `earliestDate` are null
 *   when no release date could be determined. Cached results additionally
 *   carry `fromCache: true`.
 * @throws {Error} 'Missing artist/title', 'No recordings found',
 *   'No suitable match', or any error raised by the search request.
 */
async function resolveOne(file, meta, cache) {
  const key = `${normalize(meta.artist)}|${normalize(meta.title)}`;
  if (!FORCE && cache[key]) return { ...cache[key], fromCache: true };
  if (!meta.artist || !meta.title) throw new Error('Missing artist/title');

  const recs = await searchRecording(meta.artist, meta.title);
  if (!recs.length) throw new Error('No recordings found');
  const best = pickBestRecording(recs, meta.artist, meta.title, meta.durationMs);
  if (!best) throw new Error('No suitable match');

  // All credited artist names of a recording joined into one string.
  const creditedArtists = (rec) =>
    (rec['artist-credit'] || [])
      .map((ac) => ac.name || ac.artist?.name)
      .filter(Boolean)
      .join(' ');

  const bestDate = best['first-release-date'] || null;
  const bestYear = parseDateToYear(bestDate);

  // The best match may be a later reissue, so consider every similar candidate.
  const viable = recs
    .map((r) => ({
      r,
      titleSim: similar(r.title || '', meta.title),
      artistSim: similar(creditedArtists(r), meta.artist),
      firstYear: parseDateToYear(r['first-release-date']) || null,
    }))
    .filter((v) => v.titleSim >= 0.5 && v.artistSim >= 0.5);

  // `year: null` means "unknown". A numeric sentinel such as Infinity must not
  // be used here because it would end up in the result when nothing is found.
  let earliest = { year: bestYear, date: bestDate, id: best.id };
  const detailsBudget = 5; // limit extra calls per track
  let detailsUsed = 0;
  for (const v of viable.slice(0, 10)) {
    let y = v.firstYear;
    let d = v.r['first-release-date'] || null;
    if (!y && detailsUsed < detailsBudget) {
      // Count the attempt up front so failing lookups also consume the budget.
      detailsUsed++;
      try {
        const details = await getRecordingDetails(v.r.id);
        const dates = (details.releases || []).map(
          (re) => re.date || re['release-events']?.[0]?.date || null
        );
        const er = earliestDate(dates);
        y = er.year;
        d = er.date;
      } catch (e) {
        // Best effort: a failed lookup only means this candidate contributes no date.
        console.warn(' ! Release lookup failed:', e.message);
      }
    }
    if (y && (!earliest.year || y < earliest.year)) {
      earliest = { year: y, date: d, id: v.r.id };
    }
  }

  const result = {
    file,
    title: meta.title,
    artist: meta.artist,
    mbid: earliest.id || best.id,
    earliestDate: earliest.date,
    year: earliest.year,
    confidence: {
      mbScore: best.score || null,
      titleSim: similar(best.title || '', meta.title),
      artistSim: similar(creditedArtists(best), meta.artist),
      durationMs: meta.durationMs,
      matchedDurationMs: best.length || null,
    },
  };
  cache[key] = result;
  return result;
}
/**
 * Script entry point.
 *
 * Lists the audio files (mp3/wav/m4a/ogg) in DATA_DIR, optionally narrowed by
 * FILE_FILTER and capped at MAX files, resolves the earliest release year for
 * each one, and writes the collected results to OUT_JSON plus the updated
 * lookup cache. Files that cannot be resolved are still recorded, with null
 * year/date and an `error` message. Exits with status 1 when no audio file
 * is found.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('Scanning data dir:', DATA_DIR);
  let files = fs.readdirSync(DATA_DIR).filter((f) => /\.(mp3|wav|m4a|ogg)$/i.test(f));
  if (FILE_FILTER) {
    files = files.filter((f) => f.toLowerCase().includes(FILE_FILTER));
  }
  if (!files.length) {
    console.error('No audio files in data/.');
    process.exit(1);
  }
  const cache = await readCache();
  const results = [];
  // The loop stops after MAX files, so the progress total must never exceed
  // the number of files that actually exist.
  const total = MAX ? Math.min(MAX, files.length) : files.length;
  let count = 0;
  for (const f of files) {
    if (MAX && count >= MAX) break;
    const fp = path.join(DATA_DIR, f);
    // Embedded tags win; the file name only fills in what the tags lack.
    const fromName = parseFromFilename(f);
    const tags = await getMeta(fp);
    const artist = tags.artist || fromName.artist;
    const title = tags.title || fromName.title;
    const meta = { artist, title, durationMs: tags.durationMs };
    count++;
    console.log(`\n[${count}/${total}] ${f}`);
    console.log(` -> ${artist} - ${title}`);
    try {
      const r = await resolveOne(f, meta, cache);
      results.push(r);
      console.log(
        ` ✓ Earliest: ${r.earliestDate || 'n/a'} (year=${r.year || 'n/a'}) [${r.fromCache ? 'cache' : 'MB'}]`
      );
    } catch (e) {
      console.warn(' ! Failed:', e.message);
      results.push({
        file: f,
        title,
        artist,
        mbid: null,
        earliestDate: null,
        year: null,
        error: e.message,
      });
    }
  }
  // Build index by file
  const byFile = Object.fromEntries(
    results.map((r) => [
      r.file,
      { year: r.year, date: r.earliestDate, title: r.title, artist: r.artist, mbid: r.mbid },
    ])
  );
  const out = { generatedAt: new Date().toISOString(), total: results.length, byFile, results };
  await fsp.writeFile(OUT_JSON, JSON.stringify(out, null, 2));
  await writeCache(cache);
  console.log(`\nWritten ${OUT_JSON} with ${results.length} entries.`);
  console.log('Cache saved:', path.basename(CACHE_JSON));
}
// Node versions before 18 have no global fetch; install undici's implementation there.
if (typeof fetch === 'undefined') {
  const undiciModule = await import('undici');
  global.fetch = undiciModule.default.fetch;
}

// Any error escaping main() is printed and turns into exit status 1.
const reportFatal = (err) => {
  console.error(err);
  process.exit(1);
};
main().catch(reportFatal);