init commit
This commit is contained in:
309
scripts/resolve-years.js
Normal file
309
scripts/resolve-years.js
Normal file
@@ -0,0 +1,309 @@
|
||||
// Resolve earliest release year for songs in data/ using MusicBrainz
|
||||
// Usage: node scripts/resolve-years.js [--max N] [--force] [--file SUBSTRING]
|
||||
// Respects MusicBrainz 1 req/sec guideline and caches results.
|
||||
|
||||
import fs from 'fs';
|
||||
import fsp from 'fs/promises';
|
||||
import path from 'path';
|
||||
import { fileURLToPath } from 'url';
|
||||
import { parseFile as mmParseFile } from 'music-metadata';
|
||||
import { setTimeout as wait } from 'timers/promises';
|
||||
|
||||
// ES modules have no built-in __filename/__dirname, so derive them from the
// module URL and resolve every path relative to this script's location.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const ROOT = path.resolve(__dirname, '..');
const DATA_DIR = path.join(ROOT, 'data');
// Output index written at the end of a run.
const OUT_JSON = path.join(DATA_DIR, 'years.json');
// Persistent lookup cache, stored next to the audio files.
const CACHE_JSON = path.join(DATA_DIR, '.mb_cache.json');

// Identification sent as the User-Agent header on every request.
// Set MB_CONTACT (e.g. an e-mail address or URL) to replace the 'local' placeholder.
const CONTACT = process.env.MB_CONTACT || 'local';
const USER_AGENT = `hitstar-years/0.1.0 (${CONTACT})`;

// Raw command-line tokens, used for presence checks on boolean flags.
const args = new Set(process.argv.slice(2));
|
||||
/**
 * Look up the value of a command-line option in `process.argv`.
 *
 * Both `--name value` and `--name=value` are accepted; the first matching
 * token wins.
 *
 * @param {string} name - Option name including its leading dashes, e.g. `--max`.
 * @param {string} defVal - Returned when the option is absent, or is given
 *   without a value (last token, or followed by another `--option`).
 * @returns {string} The option value, or `defVal`.
 */
function getArgValue(name, defVal) {
  const prefix = `${name}=`;
  const idx = process.argv.findIndex((a) => a === name || a.startsWith(prefix));
  if (idx === -1) return defVal;

  const token = process.argv[idx];
  // `--name=value`: take everything after the first "=" so that values which
  // themselves contain "=" are not truncated.
  if (token !== name) return token.slice(prefix.length);

  // `--name value`: the next token is the value unless it is another option.
  const next = process.argv[idx + 1];
  return next && !next.startsWith('--') ? next : defVal;
}
|
||||
// Maximum number of files to process. 0 means "no limit"; non-numeric and
// negative values are treated as 0 so that e.g. `--max -1` cannot end the run
// before the first file and overwrite the output with an empty result.
const MAX = Math.max(0, Number.parseInt(getArgValue('--max', '0'), 10) || 0);
// When set, cached lookups are ignored and MusicBrainz is queried again.
const FORCE = args.has('--force');
// Lower-cased substring a file name must contain to be processed ('' = no filter).
const FILE_FILTER = getArgValue('--file', '').toLowerCase();
|
||||
|
||||
/**
 * Reduce an artist or title string to a canonical form used for similarity
 * comparison and for cache keys.
 *
 * Steps: fold accented Latin letters to their base letter, drop
 * "(feat. ...)" groups and common version suffixes (radio edit, remaster,
 * live, ...), turn "&" into "and", unify typographic quotes, replace every
 * remaining character that is not a letter, digit, quote or whitespace with a
 * space, collapse whitespace and lower-case the result.
 *
 * Letters and digits of any script are kept. An ASCII-only filter would turn
 * "Beyoncé" into "beyonc" and reduce non-Latin names to an empty string, so
 * unrelated songs would compare as empty and share one cache key.
 * Pure-ASCII input produces exactly the same output as an ASCII-only filter.
 *
 * @param {*} str - Value to normalize; falsy values yield ''.
 * @returns {string} Normalized, lower-cased text (possibly '').
 */
function normalize(str) {
  if (!str) return '';
  return String(str)
    // Decompose characters, then strip the Latin combining diacritics (é -> e).
    .normalize('NFKD')
    .replace(/[\u0300-\u036f]/g, '')
    .replace(/\s*\([^)]*(feat\.|ft\.|featuring)[^)]*\)/gi, '') // remove (feat. ...)
    .replace(/\s*\[(?:radio edit|remaster(?:ed)?(?: \d{2,4})?|single version|album version|mono|stereo|live|version)\]/gi, '')
    .replace(/\s*-\s*(?:radio edit|remaster(?:ed)?(?: \d{2,4})?|single version|album version|mono|stereo|live|version)\b/gi, '')
    .replace(/\s*\((?:radio edit|remaster(?:ed)?(?: \d{2,4})?|single version|album version|mono|stereo|live|version|short mix|original mix|201\d remaster|20\d\d remaster)\)/gi, '')
    .replace(/\s*&\s*/g, ' and ')
    .replace(/\s+feat\.?\s+/gi, ' ')
    .replace(/\s+ft\.?\s+/gi, ' ')
    .replace(/[“”]/g, '"')
    .replace(/[’‘']/g, "'")
    // Keep letters, marks and digits of any script; everything else (including
    // hyphens) becomes a space, so no separate dash trimming is needed afterwards.
    .replace(/[^\p{L}\p{M}\p{N}'"\s]/gu, ' ')
    .replace(/\s+/g, ' ')
    .trim()
    .toLowerCase()
    // Recompose whatever the decomposition split but the stripping kept.
    .normalize('NFC');
}
|
||||
|
||||
/**
 * Guess artist and title from a file name of the form "Artist - Title.ext".
 *
 * The separator is a hyphen, en dash or em dash surrounded by whitespace; the
 * split happens at the first such separator. Names without a separator are
 * returned as a title with an empty artist.
 *
 * @param {string} file - File name or path; directory and extension are ignored.
 * @returns {{artist: string, title: string}}
 */
function parseFromFilename(file) {
  const base = path.parse(file).name;
  // Requiring whitespace on both sides keeps hyphenated names ("A-ha") intact.
  const parts = /^(.*?)\s+[-\u2013\u2014]\s+(.*)$/.exec(base);
  if (!parts) return { artist: '', title: base };
  const [, artist, title] = parts;
  return { artist: artist.trim(), title: title.trim() };
}
|
||||
|
||||
/**
 * Read title, artist, duration and year from an audio file's embedded tags.
 *
 * This is best effort: if the file cannot be parsed, a warning is printed and
 * an all-empty record is returned so the caller can fall back to other
 * sources (such as the file name).
 *
 * @param {string} fp - Path of the audio file.
 * @returns {Promise<{title: string, artist: string, durationMs: (number|null), yearTag: (number|null)}>}
 */
async function getMeta(fp) {
  try {
    const { common, format } = await mmParseFile(fp, { duration: true });
    return {
      title: common.title || '',
      artist: common.artist || '',
      // The parser reports seconds; convert to whole milliseconds.
      durationMs: Number.isFinite(format.duration) ? Math.round(format.duration * 1000) : null,
      yearTag: common.year || null,
    };
  } catch (err) {
    // Report the failure instead of hiding it, but keep going with empty tags.
    console.warn(`  ! Could not read tags from ${fp}: ${err?.message ?? err}`);
    return { title: '', artist: '', durationMs: null, yearTag: null };
  }
}
|
||||
|
||||
/**
 * Load the lookup cache from CACHE_JSON.
 *
 * A missing file is the normal first-run case and yields an empty cache
 * silently. Any other problem (unreadable file, invalid JSON, JSON that is
 * not a plain object) also yields an empty cache, but with a warning, because
 * the next save will replace the file.
 *
 * @returns {Promise<Object>} The cached entries, or `{}`.
 */
async function readCache() {
  let raw;
  try {
    raw = await fsp.readFile(CACHE_JSON, 'utf8');
  } catch (err) {
    if (err?.code !== 'ENOENT') {
      console.warn(`Cache not readable (${err?.message ?? err}); starting with an empty cache.`);
    }
    return {};
  }

  try {
    const parsed = JSON.parse(raw);
    // Entries are looked up by string key, so only a plain object is usable.
    if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) return parsed;
    console.warn('Cache file does not contain a JSON object; starting with an empty cache.');
  } catch (err) {
    console.warn(`Cache file is not valid JSON (${err.message}); starting with an empty cache.`);
  }
  return {};
}
|
||||
|
||||
/**
 * Persist the lookup cache to CACHE_JSON as pretty-printed JSON.
 *
 * The data is written to a temporary sibling file first and then renamed over
 * the real one, so an interrupted write cannot leave a truncated cache behind.
 *
 * @param {Object} cache - Cache entries to store.
 * @returns {Promise<void>}
 */
async function writeCache(cache) {
  const tmpPath = `${CACHE_JSON}.tmp`;
  await fsp.writeFile(tmpPath, JSON.stringify(cache, null, 2));
  await fsp.rename(tmpPath, CACHE_JSON);
}
|
||||
|
||||
/**
 * Similarity of two strings in the range 0..1.
 *
 * Both inputs are normalized first. Identical normalized strings score 1, an
 * empty side scores 0, and everything else is the Jaccard index of the two
 * sets of space-separated tokens (shared tokens / distinct tokens overall).
 *
 * @param {*} a - First string.
 * @param {*} b - Second string.
 * @returns {number} Similarity between 0 and 1.
 */
function similar(a, b) {
  const left = normalize(a);
  const right = normalize(b);
  if (!left || !right) return 0;
  if (left === right) return 1;

  const leftTokens = new Set(left.split(' '));
  const rightTokens = new Set(right.split(' '));
  let shared = 0;
  for (const token of leftTokens) {
    if (rightTokens.has(token)) shared++;
  }
  // |A ∪ B| = |A| + |B| - |A ∩ B|
  return shared / (leftTokens.size + rightTokens.size - shared);
}
|
||||
|
||||
/**
 * GET a URL and parse the response as JSON, retrying while the server
 * answers 503 or 429.
 *
 * After each throttled response the function pauses for the number of seconds
 * given in the Retry-After header (2 seconds when the header is missing or is
 * not a positive number) before trying again. That pause also happens after
 * the final attempt, so a caller that moves on to another request has backed
 * off first.
 *
 * @param {string} url - Absolute URL to fetch.
 * @param {number} [retries=3] - Maximum number of attempts.
 * @returns {Promise<*>} The parsed JSON body.
 * @throws {Error} On any other non-2xx status ("HTTP <status> ..."), or when
 *   every attempt was throttled ("Failed after retries ...").
 */
async function mbFetchJson(url, retries = 3) {
  let lastStatus = null;
  for (let attempt = 0; attempt < retries; attempt++) {
    const res = await fetch(url, { headers: { 'User-Agent': USER_AGENT, Accept: 'application/json' } });

    if (res.status === 503 || res.status === 429) {
      lastStatus = res.status;
      // Release the unread body; failing to do so must not abort the retry.
      try {
        await res.body?.cancel();
      } catch {
        // best-effort cleanup only
      }
      const retryAfterSec = Number(res.headers.get('Retry-After'));
      const delaySec = Number.isFinite(retryAfterSec) && retryAfterSec > 0 ? retryAfterSec : 2;
      await wait(delaySec * 1000);
      continue;
    }

    if (!res.ok) {
      const text = await res.text().catch(() => '');
      throw new Error(`HTTP ${res.status} ${res.statusText} - ${text}`);
    }
    return res.json();
  }
  throw new Error(lastStatus ? `Failed after retries (last status HTTP ${lastStatus})` : 'Failed after retries');
}
|
||||
|
||||
/**
 * Search MusicBrainz for recordings matching a title and an artist.
 *
 * Title and artist are sent as quoted phrases; backslashes and double quotes
 * inside them are escaped so they cannot break the query syntax. The function
 * always pauses 1.3 s after the request, whether it succeeded or failed, to
 * stay under the service's rate limit.
 *
 * @param {string} artist - Artist name.
 * @param {string} title - Recording title.
 * @returns {Promise<Array<Object>>} Matching recordings (possibly empty).
 * @throws {Error} Whatever mbFetchJson throws.
 */
async function searchRecording(artist, title) {
  // Inside a quoted Lucene phrase only `\` and `"` need escaping.
  const phrase = (value) => `"${String(value).replace(/[\\"]/g, '\\$&')}"`;
  const q = `recording:${phrase(title)} AND artist:${phrase(artist)}`;
  const url = `https://musicbrainz.org/ws/2/recording?fmt=json&limit=25&query=${encodeURIComponent(q)}`;
  try {
    const json = await mbFetchJson(url);
    return json.recordings || [];
  } finally {
    await wait(1300); // rate limit
  }
}
|
||||
|
||||
/**
 * Fetch one MusicBrainz recording including its releases and artist credits.
 *
 * The function always pauses 1.3 s after the request, whether it succeeded or
 * failed, to stay under the service's rate limit.
 *
 * @param {string} mbid - MusicBrainz recording id.
 * @returns {Promise<Object>} The recording document as returned by the API.
 * @throws {Error} Whatever mbFetchJson throws.
 */
async function getRecordingDetails(mbid) {
  const url = `https://musicbrainz.org/ws/2/recording/${encodeURIComponent(mbid)}?fmt=json&inc=releases+artist-credits`;
  try {
    return await mbFetchJson(url);
  } finally {
    await wait(1300); // rate limit
  }
}
|
||||
|
||||
/**
 * Choose the recording that most plausibly is the original release of a track.
 *
 * Every candidate gets a score made of: the search score (0..1), title
 * similarity (x1.5), artist similarity (x1.2), closeness of duration (up to
 * 0.8, falling to 0 at a 15 s difference), a small bonus for the number of
 * listed releases (up to 0.25) and a very small bonus for older first-release
 * years.
 *
 * Candidates whose title AND artist similarity are both at least 0.55 are
 * "viable". If any viable candidate has a first-release year, the one with
 * the earliest year wins (ties: higher score). Otherwise the highest-scoring
 * candidate overall is returned.
 *
 * @param {Array<Object>} candidates - Recordings from a MusicBrainz search.
 * @param {string} artist - Artist being looked for.
 * @param {string} title - Title being looked for.
 * @param {?number} durationMs - Track duration in milliseconds, if known.
 * @returns {?Object} The chosen recording, or null when there are no candidates.
 */
function pickBestRecording(candidates, artist, title, durationMs) {
  const wantedArtist = normalize(artist);
  const wantedTitle = normalize(title);
  let best = null;
  let bestScore = -Infinity;
  const viable = [];

  for (const rec of candidates ?? []) {
    const creditedArtists = (rec['artist-credit'] || [])
      .map((ac) => ac.name || ac.artist?.name)
      .filter(Boolean)
      .join(' ');
    const titleSim = similar(rec.title || '', wantedTitle);
    const artistSim = similar(creditedArtists, wantedArtist);

    let score = (rec.score || 0) / 100 + titleSim * 1.5 + artistSim * 1.2;

    if (durationMs && rec.length) {
      const diff = Math.abs(rec.length - durationMs);
      const durScore = Math.max(0, 1 - Math.min(diff, 15000) / 15000); // within 15s window
      score += durScore * 0.8;
    }

    // Prefer those with more releases (more evidence)
    if (Array.isArray(rec.releases)) score += Math.min(5, rec.releases.length) * 0.05;

    const firstYear = parseDateToYear(rec['first-release-date']);
    if (firstYear) {
      // Tiny bias towards older releases: about 0.067 for 1960 and 0.038 for
      // 2020, i.e. under 0.01 of score difference after the 0.3 weight.
      const ageBias = Math.max(0, 2100 - firstYear) / 2100;
      score += ageBias * 0.3;
    }

    if (score > bestScore) {
      bestScore = score;
      best = rec;
    }
    if (titleSim >= 0.55 && artistSim >= 0.55 && firstYear) {
      viable.push({ rec, firstYear, score });
    }
  }

  // Among viable matches, prefer the one with the earliest known first release year
  if (viable.length) {
    viable.sort((a, b) => a.firstYear - b.firstYear || b.score - a.score);
    return viable[0].rec;
  }
  return best;
}
|
||||
|
||||
/**
 * Extract the year from a date string that starts with four digits
 * ("1984", "1984-06", "1984-06-25").
 *
 * @param {*} dateStr - Date value; it is converted to a string before matching.
 * @returns {?number} The year, or null for falsy input or when the value does
 *   not start with four digits.
 */
function parseDateToYear(dateStr) {
  if (!dateStr) return null;
  const match = /^(\d{4})/.exec(String(dateStr));
  return match ? Number(match[1]) : null;
}
|
||||
|
||||
/**
 * Find the earliest date in a list of date strings.
 *
 * Entries that are falsy or do not yield a year are ignored. The remaining
 * strings are compared as plain strings, so a bare year ("1999") sorts before
 * a more precise date of the same year ("1999-03").
 *
 * @param {Array<?string>} dates - Candidate date strings.
 * @returns {{date: ?string, year: ?number}} The earliest date and its year,
 *   or both null when no entry is usable.
 */
function earliestDate(dates) {
  let earliest = null;
  for (const date of dates) {
    if (!date) continue;
    const year = parseDateToYear(date);
    if (!year) continue;
    if (earliest === null || date < earliest.date) earliest = { date, year };
  }
  return earliest ?? { date: null, year: null };
}
|
||||
|
||||
/**
 * Resolve the earliest known release year of one track via MusicBrainz.
 *
 * The result is cached under "<normalized artist>|<normalized title>". A
 * cache hit is returned with `fromCache: true` and with `file` set to the file
 * being processed now, so two files that describe the same track each get
 * their own entry. Tracks whose normalized artist or title is empty are never
 * cached, because they would all share one key.
 *
 * On a cache miss the track is searched, the best match is chosen, and then
 * up to ten candidates whose title and artist similarity are both at least
 * 0.5 are scanned for an earlier first-release year. For candidates without a
 * year, at most five detail lookups are attempted (failed ones count too).
 *
 * @param {string} file - File name the result belongs to.
 * @param {{artist: string, title: string, durationMs: ?number}} meta - Track metadata.
 * @param {Object} cache - Mutable cache object; updated in place.
 * @returns {Promise<Object>} Result with file, title, artist, mbid,
 *   earliestDate, year (null when unknown) and a `confidence` block that
 *   describes the best-scoring match.
 * @throws {Error} 'Missing artist/title', 'No recordings found',
 *   'No suitable match', or a search error.
 */
async function resolveOne(file, meta, cache) {
  if (!meta.artist || !meta.title) throw new Error('Missing artist/title');

  const artistKey = normalize(meta.artist);
  const titleKey = normalize(meta.title);
  // An empty part would make unrelated tracks collide on the same key.
  const cacheable = Boolean(artistKey && titleKey);
  const key = `${artistKey}|${titleKey}`;
  if (cacheable && !FORCE && cache[key]) return { ...cache[key], file, fromCache: true };

  const creditedArtists = (rec) =>
    (rec['artist-credit'] || [])
      .map((ac) => ac.name || ac.artist?.name)
      .filter(Boolean)
      .join(' ');

  const recs = await searchRecording(meta.artist, meta.title);
  if (!recs.length) throw new Error('No recordings found');
  const best = pickBestRecording(recs, meta.artist, meta.title, meta.durationMs);
  if (!best) throw new Error('No suitable match');

  const bestDate = best['first-release-date'] || null;

  // The best match may be a later reissue, so inspect the other close matches.
  const viable = recs
    .map((r) => ({
      r,
      titleSim: similar(r.title || '', meta.title),
      artistSim: similar(creditedArtists(r), meta.artist),
      firstYear: parseDateToYear(r['first-release-date']) || null,
    }))
    .filter((v) => v.titleSim >= 0.5 && v.artistSim >= 0.5);

  // `year` stays null (never Infinity) while no usable date has been seen.
  let earliest = { year: parseDateToYear(bestDate), date: bestDate, id: best.id };
  const detailsBudget = 5; // limit extra calls per track
  let detailsUsed = 0;

  for (const v of viable.slice(0, 10)) {
    let candidateYear = v.firstYear;
    let candidateDate = v.r['first-release-date'] || null;

    if (!candidateYear && detailsUsed < detailsBudget) {
      // Count the attempt up front so failing lookups cannot exceed the budget.
      detailsUsed++;
      try {
        const details = await getRecordingDetails(v.r.id);
        const dates = (details.releases || []).map((rel) => rel.date || rel['release-events']?.[0]?.date || null);
        ({ year: candidateYear, date: candidateDate } = earliestDate(dates));
      } catch (err) {
        // Best effort: this candidate simply contributes no date.
        console.warn(`  ! Details lookup failed for ${v.r.id}: ${err?.message ?? err}`);
      }
    }

    if (candidateYear && (earliest.year === null || candidateYear < earliest.year)) {
      earliest = { year: candidateYear, date: candidateDate, id: v.r.id };
    }
  }

  const result = {
    file,
    title: meta.title,
    artist: meta.artist,
    mbid: earliest.id || best.id,
    earliestDate: earliest.date,
    year: earliest.year,
    confidence: {
      mbScore: best.score || null,
      titleSim: similar(best.title || '', meta.title),
      artistSim: similar(creditedArtists(best), meta.artist),
      durationMs: meta.durationMs,
      matchedDurationMs: best.length || null,
    },
  };
  if (cacheable) cache[key] = result;
  return result;
}
|
||||
|
||||
/**
 * Entry point: scan DATA_DIR for audio files, resolve each one and write the
 * combined result to OUT_JSON.
 *
 * Honors FILE_FILTER (substring match on the file name) and MAX (process at
 * most that many files). For every file, artist and title come from the
 * embedded tags, falling back to the file name. Files that cannot be resolved
 * are still listed in the output, with null fields and an `error` message.
 *
 * The cache is saved after every track that was resolved over the network,
 * so an interrupted run keeps the lookups it has already done.
 *
 * Exits the process with status 1 when no matching audio file exists.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('Scanning data dir:', DATA_DIR);
  let files = fs.readdirSync(DATA_DIR).filter((f) => /\.(mp3|wav|m4a|ogg)$/i.test(f));
  if (FILE_FILTER) {
    files = files.filter((f) => f.toLowerCase().includes(FILE_FILTER));
  }
  if (!files.length) {
    console.error('No audio files in data/.');
    process.exit(1);
  }

  const cache = await readCache();
  const results = [];
  // Apply the limit up front so the progress total is the real number of files.
  const queue = MAX ? files.slice(0, MAX) : files;

  // Saving progress is best effort and must never fail the track itself.
  const checkpointCache = async () => {
    try {
      await writeCache(cache);
    } catch (err) {
      console.warn('  ! Could not save cache:', err?.message ?? err);
    }
  };

  for (const [index, f] of queue.entries()) {
    const fp = path.join(DATA_DIR, f);
    const fromName = parseFromFilename(f);
    const tags = await getMeta(fp);
    const artist = tags.artist || fromName.artist;
    const title = tags.title || fromName.title;
    const meta = { artist, title, durationMs: tags.durationMs };

    console.log(`\n[${index + 1}/${queue.length}] ${f}`);
    console.log(` -> ${artist} — ${title}`);

    let resolved = null;
    try {
      resolved = await resolveOne(f, meta, cache);
      results.push(resolved);
      console.log(` ✓ Earliest: ${resolved.earliestDate || 'n/a'} (year=${resolved.year || 'n/a'}) [${resolved.fromCache ? 'cache' : 'MB'}]`);
    } catch (e) {
      console.warn(' ! Failed:', e.message);
      results.push({ file: f, title, artist, mbid: null, earliestDate: null, year: null, error: e.message });
    }
    if (resolved && !resolved.fromCache) await checkpointCache();
  }

  // Build index by file
  const byFile = Object.fromEntries(
    results.map((r) => [r.file, { year: r.year, date: r.earliestDate, title: r.title, artist: r.artist, mbid: r.mbid }]),
  );
  const out = { generatedAt: new Date().toISOString(), total: results.length, byFile, results };
  await fsp.writeFile(OUT_JSON, JSON.stringify(out, null, 2));
  await writeCache(cache);
  console.log(`\nWritten ${OUT_JSON} with ${results.length} entries.`);
  console.log('Cache saved:', path.basename(CACHE_JSON));
}
|
||||
|
||||
// Ensure fetch exists in Node <18: fall back to undici's implementation when
// the runtime has no global fetch.
if (typeof fetch === 'undefined') {
  const { default: undici } = await import('undici');
  globalThis.fetch = undici.fetch;
}

// Run the script; any uncaught failure is printed and reported through a
// non-zero exit status.
main().catch((e) => {
  console.error(e);
  process.exit(1);
});
|
||||
Reference in New Issue
Block a user