// Resolve earliest release year for songs in data/ using MusicBrainz
// Usage: node scripts/resolve-years.js [--max N] [--force] [--file SUBSTRING]
// Respects MusicBrainz 1 req/sec guideline and caches results.
import fs from 'fs';
import fsp from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
import { parseFile as mmParseFile } from 'music-metadata';
import { setTimeout as wait } from 'timers/promises';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const ROOT = path.resolve(__dirname, '..');
const DATA_DIR = path.join(ROOT, 'data');
const OUT_JSON = path.join(DATA_DIR, 'years.json');
const CACHE_JSON = path.join(DATA_DIR, '.mb_cache.json');
const CONTACT = process.env.MB_CONTACT || 'local';
const USER_AGENT = `hitstar-years/0.1.0 (${CONTACT})`;

const args = new Set(process.argv.slice(2));

/**
 * Read the value of a command-line option given as `--name value` or `--name=value`.
 *
 * @param {string} name - Option name including leading dashes (e.g. `--max`).
 * @param {string} defVal - Value returned when the option is absent or has no value.
 * @returns {string} The option value, or `defVal`.
 */
function getArgValue(name, defVal) {
  const prefix = `${name}=`;
  const i = process.argv.findIndex((a) => a === name || a.startsWith(prefix));
  if (i === -1) return defVal;
  const a = process.argv[i];
  // Take everything after the first '=' so values that themselves contain '=' survive.
  if (a.startsWith(prefix)) return a.slice(prefix.length);
  const next = process.argv[i + 1];
  return next && !next.startsWith('--') ? next : defVal;
}

const MAX = parseInt(getArgValue('--max', '0'), 10) || 0;
const FORCE = args.has('--force');
const FILE_FILTER = getArgValue('--file', '').toLowerCase();

/**
 * Reduce an artist/title string to a lowercase, punctuation-free form for
 * comparison and cache keys. Strips "(feat. ...)" groups, common version
 * suffixes (radio edit, remaster, live, ...), maps '&' to 'and' and collapses
 * whitespace.
 *
 * @param {*} str - Value to normalize; falsy values yield ''.
 * @returns {string} Normalized string.
 */
function normalize(str) {
  if (!str) return '';
  let s = String(str)
    .replace(/\s*\([^)]*(feat\.|ft\.|featuring)[^)]*\)/gi, '') // remove (feat. ...)
    .replace(/\s*\[(?:radio edit|remaster(?:ed)?(?: \d{2,4})?|single version|album version|mono|stereo|live|version)\]/gi, '')
    .replace(/\s*-\s*(?:radio edit|remaster(?:ed)?(?: \d{2,4})?|single version|album version|mono|stereo|live|version)\b/gi, '')
    .replace(/\s*\((?:radio edit|remaster(?:ed)?(?: \d{2,4})?|single version|album version|mono|stereo|live|version|short mix|original mix|201\d remaster|20\d\d remaster)\)/gi, '')
    .replace(/\s*&\s*/g, ' and ')
    .replace(/\s+feat\.?\s+/gi, ' ')
    .replace(/\s+ft\.?\s+/gi, ' ')
    .replace(/[“”]/g, '"')
    .replace(/[’‘']/g, "'")
    .replace(/[^a-z0-9'"\s]/gi, ' ')
    .replace(/\s+/g, ' ')
    .trim()
    .toLowerCase();
  // trim leading/trailing dashes/spaces
  s = s.replace(/^[-\s]+/, '').replace(/[-\s]+$/, '');
  return s;
}

/**
 * Derive artist and title from a file name of the form "Artist - Title.ext".
 *
 * @param {string} file - File name or path.
 * @returns {{artist: string, title: string}} Parsed parts; when no " - "
 *   separator is present the whole base name becomes the title.
 */
function parseFromFilename(file) {
  const base = path.parse(file).name;
  const m = base.match(/^(.*?)\s+-\s+(.*)$/); // Artist - Title
  if (m) {
    return { artist: m[1].trim(), title: m[2].trim() };
  }
  return { artist: '', title: base };
}

/**
 * Read title, artist, duration and year tags from an audio file.
 *
 * @param {string} fp - Path to the audio file.
 * @returns {Promise<{title: string, artist: string, durationMs: (number|null), yearTag: (number|null)}>}
 *   Tag values; empty/null fields when the file cannot be parsed.
 */
async function getMeta(fp) {
  try {
    const meta = await mmParseFile(fp, { duration: true });
    return {
      title: meta.common.title || '',
      artist: meta.common.artist || '',
      durationMs: Number.isFinite(meta.format.duration) ? Math.round(meta.format.duration * 1000) : null,
      yearTag: meta.common.year || null,
    };
  } catch {
    // Best effort: unreadable tags fall back to file-name parsing in main().
    return { title: '', artist: '', durationMs: null, yearTag: null };
  }
}

/**
 * Load the lookup cache from CACHE_JSON.
 *
 * @returns {Promise<Object>} Parsed cache, or an empty object when the file is
 *   missing or not valid JSON.
 */
async function readCache() {
  try {
    const j = JSON.parse(await fsp.readFile(CACHE_JSON, 'utf8'));
    return j || {};
  } catch {
    return {};
  }
}

/**
 * Persist the lookup cache to CACHE_JSON as pretty-printed JSON.
 *
 * @param {Object} cache - Cache object to write.
 * @returns {Promise<void>}
 */
async function writeCache(cache) {
  await fsp.writeFile(CACHE_JSON, JSON.stringify(cache, null, 2));
}

/**
 * Similarity of two strings in [0, 1]: 1 when their normalized forms are
 * equal, otherwise the Jaccard overlap of their space-separated tokens.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} Similarity score; 0 when either side normalizes to ''.
 */
function similar(a, b) {
  const na = normalize(a);
  const nb = normalize(b);
  if (!na || !nb) return 0;
  if (na === nb) return 1;
  // simple token overlap Jaccard
  const as = new Set(na.split(' '));
  const bs = new Set(nb.split(' '));
  const inter = [...as].filter((x) => bs.has(x)).length;
  const union = new Set([...as, ...bs]).size;
  return inter / union;
}

/**
 * GET a URL and parse the JSON body, retrying on HTTP 429/503.
 *
 * @param {string} url - URL to fetch.
 * @param {number} [retries=3] - Maximum number of attempts.
 * @returns {Promise<*>} Parsed JSON body.
 * @throws {Error} On any other non-OK status, or when all attempts were throttled.
 */
async function mbFetchJson(url, retries = 3) {
  for (let i = 0; i < retries; i++) {
    const res = await fetch(url, { headers: { 'User-Agent': USER_AGENT, 'Accept': 'application/json' } });
    if (res.status === 503 || res.status === 429) {
      // Retry-After is read as seconds; anything non-numeric falls back to 2s.
      const ra = Number(res.headers.get('Retry-After')) || 2;
      await wait(ra * 1000);
      continue;
    }
    if (!res.ok) {
      const text = await res.text().catch(() => '');
      throw new Error(`HTTP ${res.status} ${res.statusText} - ${text}`);
    }
    return res.json();
  }
  throw new Error('Failed after retries');
}

/**
 * Escape a value for use inside a double-quoted phrase of a Lucene query, so
 * that quotes or backslashes in tags cannot terminate the phrase early.
 *
 * @param {*} value - Raw text.
 * @returns {string} Text with `"` and `\` backslash-escaped.
 */
function escapeLucenePhrase(value) {
  return String(value).replace(/["\\]/g, '\\$&');
}

/**
 * Search MusicBrainz recordings by artist and title.
 *
 * @param {string} artist - Artist name.
 * @param {string} title - Track title.
 * @returns {Promise<Array>} The `recordings` array of the response (possibly empty).
 */
async function searchRecording(artist, title) {
  const q = `recording:"${escapeLucenePhrase(title)}" AND artist:"${escapeLucenePhrase(artist)}"`;
  const url = `https://musicbrainz.org/ws/2/recording?fmt=json&limit=25&query=${encodeURIComponent(q)}`;
  const json = await mbFetchJson(url);
  await wait(1300); // rate limit
  return json.recordings || [];
}

/**
 * Fetch a single recording with its releases and artist credits.
 *
 * @param {string} mbid - MusicBrainz recording id.
 * @returns {Promise<Object>} Parsed response body.
 */
async function getRecordingDetails(mbid) {
  const url = `https://musicbrainz.org/ws/2/recording/${encodeURIComponent(mbid)}?fmt=json&inc=releases+artist-credits`;
  const json = await mbFetchJson(url);
  await wait(1300); // rate limit
  return json;
}

/**
 * Choose the search result that best matches the given track.
 *
 * Each candidate gets a score from the MusicBrainz score, title/artist
 * similarity, duration closeness, number of releases and a small bias towards
 * older first-release years. If any candidates with title and artist
 * similarity >= 0.55 carry a first-release year, the earliest of those wins
 * (ties broken by score); otherwise the highest-scoring candidate is returned.
 *
 * @param {Array<Object>} candidates - Recording objects from the search.
 * @param {string} artist - Wanted artist.
 * @param {string} title - Wanted title.
 * @param {(number|null)} durationMs - Local track duration in ms, if known.
 * @returns {(Object|null)} Chosen recording, or null for an empty candidate list.
 */
function pickBestRecording(candidates, artist, title, durationMs) {
  const nArtist = normalize(artist);
  const nTitle = normalize(title);
  let best = null;
  let bestScore = -Infinity;
  const viable = [];

  for (const r of candidates) {
    const rTitle = r.title || '';
    const rArtists = (r['artist-credit'] || [])
      .map((ac) => ac.name || ac.artist?.name)
      .filter(Boolean)
      .join(' ');
    const titleSim = similar(rTitle, nTitle);
    const artistSim = similar(rArtists, nArtist);
    let score = (r.score || 0) / 100 + titleSim * 1.5 + artistSim * 1.2;

    if (durationMs && r.length) {
      const diff = Math.abs(r.length - durationMs);
      const durScore = Math.max(0, 1 - Math.min(diff, 15000) / 15000); // within 15s window
      score += durScore * 0.8;
    }

    // Prefer those with more releases (more evidence)
    if (Array.isArray(r.releases)) score += Math.min(5, r.releases.length) * 0.05;

    const firstYear = parseDateToYear(r['first-release-date']);
    if (firstYear) {
      // Tiny bias towards older original releases
      const ageBias = Math.max(0, 2100 - firstYear) / 2100; // e.g. ~0.067 for 1960, ~0.038 for 2020
      score += ageBias * 0.3;
    }

    if (score > bestScore) {
      bestScore = score;
      best = r;
    }
    if (titleSim >= 0.55 && artistSim >= 0.55) {
      viable.push({ r, firstYear: firstYear || null, titleSim, artistSim, score });
    }
  }

  // Among viable matches, prefer the one with the earliest known first release year
  const withYear = viable.filter((v) => v.firstYear);
  if (withYear.length) {
    withYear.sort((a, b) => a.firstYear - b.firstYear || b.score - a.score);
    return withYear[0].r;
  }
  return best;
}

/**
 * Extract the leading four-digit year from a date string.
 *
 * @param {*} dateStr - Date such as "1969", "1969-07" or "1969-07-20".
 * @returns {(number|null)} Year, or null when the value is empty or does not
 *   start with four digits.
 */
function parseDateToYear(dateStr) {
  if (!dateStr) return null;
  const re = /^(\d{4})/;
  const m = re.exec(String(dateStr));
  return m ? Number(m[1]) : null;
}

/**
 * Find the earliest date in a list of date strings.
 *
 * @param {Array<(string|null)>} dates - Date strings; entries without a
 *   parseable year are ignored.
 * @returns {{date: (string|null), year: (number|null)}} Earliest date by string
 *   comparison and its year, or nulls when nothing is usable.
 */
function earliestDate(dates) {
  const valid = dates
    .filter(Boolean)
    .map((d) => ({ d, y: parseDateToYear(d) }))
    .filter((x) => x.y);
  if (!valid.length) return { date: null, year: null };
  valid.sort((a, b) => {
    if (a.d < b.d) return -1;
    if (a.d > b.d) return 1;
    return 0;
  });
  return { date: valid[0].d, year: valid[0].y };
}

/**
 * Resolve the earliest release date/year for one track, using the cache when
 * possible. A fresh result is stored in `cache` under the normalized
 * "artist|title" key.
 *
 * @param {string} file - File name the result is reported under.
 * @param {{artist: string, title: string, durationMs: (number|null)}} meta - Track metadata.
 * @param {Object} cache - Mutable cache object.
 * @returns {Promise<Object>} Result record (`fromCache: true` on a cache hit).
 * @throws {Error} When artist/title is missing or no recording matches.
 */
async function resolveOne(file, meta, cache) {
  const key = `${normalize(meta.artist)}|${normalize(meta.title)}`;
  // Override the cached `file`: the entry may have been created for another
  // file with the same artist/title, and main() indexes results by file name.
  if (!FORCE && cache[key]) return { ...cache[key], file, fromCache: true };
  if (!meta.artist || !meta.title) throw new Error('Missing artist/title');

  const recs = await searchRecording(meta.artist, meta.title);
  if (!recs.length) throw new Error('No recordings found');
  const best = pickBestRecording(recs, meta.artist, meta.title, meta.durationMs);
  if (!best) throw new Error('No suitable match');

  let firstDate = best['first-release-date'] || null;
  let year = parseDateToYear(firstDate);

  // If no year on best, or if best appears to be a later reissue, inspect more candidates
  const viable = recs
    .map((r) => ({
      r,
      titleSim: similar(r.title || '', meta.title),
      artistSim: similar(
        (r['artist-credit'] || []).map((ac) => ac.name || ac.artist?.name).filter(Boolean).join(' '),
        meta.artist,
      ),
      firstYear: parseDateToYear(r['first-release-date']) || null,
    }))
    .filter((v) => v.titleSim >= 0.5 && v.artistSim >= 0.5);

  // Determine earliest among top candidates, fetching details when missing
  let earliest = { year: year || Infinity, date: firstDate || null, id: best.id };
  const detailsBudget = 5; // limit extra calls per track
  let detailsUsed = 0;
  for (const v of viable.slice(0, 10)) {
    let y = v.firstYear;
    let d = v.r['first-release-date'] || null;
    if (!y && detailsUsed < detailsBudget) {
      // Count attempts rather than successes so failing lookups cannot exceed the budget.
      detailsUsed++;
      try {
        const details = await getRecordingDetails(v.r.id);
        const dates = (details.releases || []).map((rel) => rel.date || rel['release-events']?.[0]?.date || null);
        const er = earliestDate(dates);
        y = er.year;
        d = er.date;
      } catch (err) {
        // Best effort: a failed detail lookup only means this candidate is skipped.
        console.warn(` ! Details lookup failed for ${v.r.id}: ${err.message}`);
      }
    }
    if (y && y < (earliest.year || Infinity)) {
      earliest = { year: y, date: d, id: v.r.id };
    }
  }
  // earliest.year stays Infinity when no candidate has a year; never let that
  // placeholder leak into the result.
  if (Number.isFinite(earliest.year) && earliest.year !== year) {
    year = earliest.year;
    firstDate = earliest.date;
  }

  const result = {
    file,
    title: meta.title,
    artist: meta.artist,
    mbid: earliest.id || best.id,
    earliestDate: firstDate,
    year,
    confidence: {
      mbScore: best.score || null,
      titleSim: similar(best.title || '', meta.title),
      artistSim: similar(
        (best['artist-credit'] || []).map((ac) => ac.name || ac.artist?.name).filter(Boolean).join(' '),
        meta.artist,
      ),
      durationMs: meta.durationMs,
      matchedDurationMs: best.length || null,
    },
  };
  cache[key] = result;
  return result;
}

/**
 * Entry point: scan DATA_DIR for audio files, resolve each one and write
 * OUT_JSON plus the cache. Exits with status 1 when no audio files match.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('Scanning data dir:', DATA_DIR);
  let files = fs.readdirSync(DATA_DIR).filter((f) => /\.(mp3|wav|m4a|ogg)$/i.test(f));
  if (FILE_FILTER) {
    files = files.filter((f) => f.toLowerCase().includes(FILE_FILTER));
  }
  if (!files.length) {
    console.error('No audio files in data/.');
    process.exit(1);
  }

  const cache = await readCache();
  const results = [];
  // Number of files that will actually be processed, for the progress display.
  const total = MAX ? Math.min(MAX, files.length) : files.length;
  let count = 0;
  for (const f of files) {
    if (MAX && count >= MAX) break;
    const fp = path.join(DATA_DIR, f);
    const fromName = parseFromFilename(f);
    const tags = await getMeta(fp);
    // Embedded tags win; the file name is only a fallback.
    const artist = tags.artist || fromName.artist;
    const title = tags.title || fromName.title;
    const meta = { artist, title, durationMs: tags.durationMs };
    count++;
    console.log(`\n[${count}/${total}] ${f}`);
    console.log(` -> ${artist} — ${title}`);
    try {
      const r = await resolveOne(f, meta, cache);
      results.push(r);
      console.log(` ✓ Earliest: ${r.earliestDate || 'n/a'} (year=${r.year || 'n/a'}) [${r.fromCache ? 'cache' : 'MB'}]`);
    } catch (e) {
      console.warn(' ! Failed:', e.message);
      results.push({ file: f, title, artist, mbid: null, earliestDate: null, year: null, error: e.message });
    }
  }

  // Build index by file
  const byFile = Object.fromEntries(
    results.map((r) => [r.file, { year: r.year, date: r.earliestDate, title: r.title, artist: r.artist, mbid: r.mbid }]),
  );
  const out = { generatedAt: new Date().toISOString(), total: results.length, byFile, results };
  await fsp.writeFile(OUT_JSON, JSON.stringify(out, null, 2));
  await writeCache(cache);
  console.log(`\nWritten ${OUT_JSON} with ${results.length} entries.`);
  console.log('Cache saved:', path.basename(CACHE_JSON));
}

// Ensure fetch exists in Node <18
if (typeof fetch === 'undefined') {
  const { default: undici } = await import('undici');
  global.fetch = undici.fetch;
}

main().catch((e) => {
  console.error(e);
  process.exit(1);
});