init commit

This commit is contained in:
2025-09-03 19:34:00 +02:00
commit 4cbf97cc5a
12 changed files with 11751 additions and 0 deletions

309
scripts/resolve-years.js Normal file
View File

@@ -0,0 +1,309 @@
// Resolve earliest release year for songs in data/ using MusicBrainz
// Usage: node scripts/resolve-years.js [--max N] [--force] [--file SUBSTRING]
// Respects MusicBrainz 1 req/sec guideline and caches results.
import fs from 'fs';
import fsp from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
import { parseFile as mmParseFile } from 'music-metadata';
import { setTimeout as wait } from 'timers/promises';
// ES modules have no built-in __filename/__dirname, so derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// The project root is the parent of the directory that contains this script.
const ROOT = path.resolve(__dirname, '..');
const DATA_DIR = path.join(ROOT, 'data');
// The generated index and the lookup cache both live inside the data directory.
const OUT_JSON = path.join(DATA_DIR, 'years.json');
const CACHE_JSON = path.join(DATA_DIR, '.mb_cache.json');
// Contact string embedded in the User-Agent header; read from MB_CONTACT, defaulting to 'local'.
const CONTACT = process.env.MB_CONTACT || 'local';
const USER_AGENT = `hitstar-years/0.1.0 (${CONTACT})`;
// Raw CLI arguments as a set, used for presence checks of boolean flags.
const args = new Set(process.argv.slice(2));
/**
 * Look up the value of a command-line option in `process.argv`.
 *
 * Both `--name=value` and `--name value` are accepted. In the `=` form,
 * everything after the first `=` is the value, so values may themselves
 * contain `=` (e.g. `--file=a=b` yields `a=b`).
 *
 * @param {string} name - Option name including leading dashes, e.g. `--max`.
 * @param {*} defVal - Returned when the option is absent, or when it is given
 *   in the space-separated form without a usable value (nothing follows it, or
 *   the next argument starts with `--`).
 * @returns {*} The option value as a string, or `defVal`.
 */
function getArgValue(name, defVal) {
  const { argv } = process;
  const prefix = `${name}=`;
  const idx = argv.findIndex((arg) => arg === name || arg.startsWith(prefix));
  if (idx === -1) return defVal;

  const arg = argv[idx];
  // Slice at the first '=' instead of split('=')[1], which would cut the value
  // short at a second '='.
  if (arg.includes('=')) return arg.slice(arg.indexOf('=') + 1);

  const next = argv[idx + 1];
  return next && !next.startsWith('--') ? next : defVal;
}
// Maximum number of files to process; 0 (also the result of a missing or non-numeric value) disables the limit.
const MAX = parseInt(getArgValue('--max', '0'), 10) || 0;
// When set, cached results are ignored and every track is looked up again.
const FORCE = args.has('--force');
// Lower-cased substring; when non-empty, only files whose lower-cased name contains it are processed.
const FILE_FILTER = getArgValue('--file', '').toLowerCase();
/**
 * Reduce an artist or title string to a canonical form used for fuzzy
 * comparison and for building cache keys.
 *
 * Steps, in order:
 *  - fold accented characters to their base letter ("Beyoncé" -> "beyonce");
 *  - drop "(feat. ...)" groups and common edition suffixes (radio edit,
 *    remaster, live, ...) in parentheses, brackets or after a dash;
 *  - rewrite "&" as "and" and drop free-standing "feat."/"ft.";
 *  - map typographic quotes and apostrophes to their ASCII form;
 *  - replace every remaining character that is not a letter, mark, digit,
 *    quote or whitespace with a space, collapse whitespace, trim, lower-case.
 *
 * Letters and digits of every script are kept, so names written entirely in
 * e.g. Cyrillic or CJK do not collapse to the empty string.
 *
 * @param {*} str - Value to normalise; falsy values yield `''`.
 * @returns {string} Normalised lower-case string (possibly empty).
 */
function normalize(str) {
  if (!str) return '';
  return String(str)
    // Decompose accented characters, then drop the Latin combining marks.
    .normalize('NFKD')
    .replace(/[\u0300-\u036f]/g, '')
    .replace(/\s*\([^)]*(feat\.|ft\.|featuring)[^)]*\)/gi, '') // remove (feat. ...)
    .replace(/\s*\[(?:radio edit|remaster(?:ed)?(?: \d{2,4})?|single version|album version|mono|stereo|live|version)\]/gi, '')
    .replace(/\s*-\s*(?:radio edit|remaster(?:ed)?(?: \d{2,4})?|single version|album version|mono|stereo|live|version)\b/gi, '')
    .replace(/\s*\((?:radio edit|remaster(?:ed)?(?: \d{2,4})?|single version|album version|mono|stereo|live|version|short mix|original mix|201\d remaster|20\d\d remaster)\)/gi, '')
    .replace(/\s*&\s*/g, ' and ')
    .replace(/\s+feat\.?\s+/gi, ' ')
    .replace(/\s+ft\.?\s+/gi, ' ')
    // Typographic double quotes and apostrophes -> ASCII, so "don’t" and
    // "don't" normalise identically.
    .replace(/[\u201C\u201D]/g, '"')
    .replace(/[\u2018\u2019]/g, "'")
    // Keep letters, marks and digits of any script; everything else
    // (punctuation, dashes, symbols) becomes a separator.
    .replace(/[^\p{L}\p{M}\p{N}'"\s]/gu, ' ')
    .replace(/\s+/g, ' ')
    .trim()
    .toLowerCase();
}
/**
 * Derive artist and title from a file name shaped like "Artist - Title.ext".
 *
 * The extension is dropped first. The name is split at the first dash that
 * has whitespace on both sides; when there is no such dash the whole base
 * name is returned as the title with an empty artist.
 *
 * @param {string} file - File name or path.
 * @returns {{artist: string, title: string}}
 */
function parseFromFilename(file) {
  const { name: stem } = path.parse(file);
  const parts = /^(.*?)\s+-\s+(.*)$/.exec(stem); // Artist - Title
  if (parts === null) {
    return { artist: '', title: stem };
  }
  const [, rawArtist, rawTitle] = parts;
  return { artist: rawArtist.trim(), title: rawTitle.trim() };
}
/**
 * Read the tags this script needs from an audio file.
 *
 * Any failure while parsing is treated as "no tags": the same object shape is
 * returned with empty strings and nulls, so the caller can fall back to the
 * file name.
 *
 * @param {string} fp - Path of the audio file.
 * @returns {Promise<{title: string, artist: string, durationMs: (number|null), yearTag: (number|null)}>}
 */
async function getMeta(fp) {
  try {
    const { common, format } = await mmParseFile(fp, { duration: true });
    const seconds = format.duration;
    return {
      title: common.title || '',
      artist: common.artist || '',
      // Duration is reported in seconds; store whole milliseconds.
      durationMs: Number.isFinite(seconds) ? Math.round(seconds * 1000) : null,
      yearTag: common.year || null,
    };
  } catch {
    return { title: '', artist: '', durationMs: null, yearTag: null };
  }
}
/**
 * Load the lookup cache from CACHE_JSON.
 *
 * A missing cache file is the normal first-run case and silently yields an
 * empty cache. Any other problem (unreadable file, invalid JSON, JSON that is
 * not a plain object) also yields an empty cache, but is reported with a
 * warning so that a damaged cache is not discarded unnoticed.
 *
 * @returns {Promise<Object>} The cached entries keyed by "artist|title", or `{}`.
 */
async function readCache() {
  let raw;
  try {
    raw = await fsp.readFile(CACHE_JSON, 'utf8');
  } catch (err) {
    if (err?.code !== 'ENOENT') {
      console.warn('Could not read cache, starting empty:', err?.message ?? err);
    }
    return {};
  }

  try {
    const parsed = JSON.parse(raw);
    // Entries are assigned by string key later, so only a plain object is usable.
    if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) return parsed;
    if (parsed) console.warn('Cache file does not contain an object, starting empty.');
  } catch (err) {
    console.warn('Cache file is not valid JSON, starting empty:', err.message);
  }
  return {};
}
/**
 * Persist the lookup cache to CACHE_JSON as indented JSON.
 *
 * @param {Object} cache - Cache object to serialise.
 * @returns {Promise<void>}
 */
async function writeCache(cache) {
  const serialized = JSON.stringify(cache, null, 2);
  await fsp.writeFile(CACHE_JSON, serialized);
}
/**
 * Score how alike two strings are after normalisation.
 *
 * Returns 0 when either side normalises to an empty string, 1 when both
 * normalise to the same string, and otherwise the Jaccard index of their
 * space-separated token sets (shared tokens / distinct tokens overall).
 *
 * @param {*} a - First value.
 * @param {*} b - Second value.
 * @returns {number} Similarity in the range [0, 1].
 */
function similar(a, b) {
  const left = normalize(a);
  const right = normalize(b);
  if (!left || !right) return 0;
  if (left === right) return 1;

  const leftTokens = new Set(left.split(' '));
  const rightTokens = new Set(right.split(' '));
  let shared = 0;
  for (const token of leftTokens) {
    if (rightTokens.has(token)) shared += 1;
  }
  // |A ∪ B| = |A| + |B| - |A ∩ B|
  const distinct = leftTokens.size + rightTokens.size - shared;
  return shared / distinct;
}
/**
 * GET a URL and parse the response as JSON, retrying while the server
 * answers 503 or 429.
 *
 * Between attempts the function sleeps for the number of seconds given in the
 * `Retry-After` header, or 2 seconds when the header is missing, zero or not a
 * positive number. No sleep is performed after the final attempt, since
 * nothing follows it.
 *
 * @param {string} url - URL to request.
 * @param {number} [retries=3] - Maximum number of attempts.
 * @returns {Promise<*>} The parsed JSON body.
 * @throws {Error} For any other non-OK status (message contains status and
 *   body text), or when every attempt was throttled.
 */
async function mbFetchJson(url, retries = 3) {
  let lastStatus = null;
  for (let attempt = 0; attempt < retries; attempt++) {
    const res = await fetch(url, { headers: { 'User-Agent': USER_AGENT, 'Accept': 'application/json' } });

    if (res.status === 503 || res.status === 429) {
      lastStatus = res.status;
      // Consume the body so the connection can be reused; its content is not needed.
      await res.text().catch(() => '');
      if (attempt === retries - 1) break; // out of attempts: fail now instead of sleeping first
      const retryAfter = Number(res.headers.get('Retry-After'));
      const delaySec = Number.isFinite(retryAfter) && retryAfter > 0 ? retryAfter : 2;
      await wait(delaySec * 1000);
      continue;
    }

    if (!res.ok) {
      const text = await res.text().catch(() => '');
      throw new Error(`HTTP ${res.status} ${res.statusText} - ${text}`);
    }
    return res.json();
  }
  const detail = lastStatus === null ? '' : ` (last status: HTTP ${lastStatus})`;
  throw new Error(`Failed after retries${detail}`);
}
/**
 * Search MusicBrainz for recordings matching an artist and title.
 *
 * Both values are placed inside quoted phrases of the search query, so any
 * `"` or `\` they contain is backslash-escaped first; otherwise a title such
 * as `Say "Hello"` would terminate the phrase early and corrupt the query.
 *
 * The rate-limit pause runs whether or not the request succeeds, so a failed
 * request is never followed immediately by another one.
 *
 * @param {string} artist - Artist name.
 * @param {string} title - Track title.
 * @returns {Promise<Array<Object>>} The `recordings` array of the response, or `[]`.
 */
async function searchRecording(artist, title) {
  const escapePhrase = (value) => String(value).replace(/(["\\])/g, '\\$1');
  const q = `recording:"${escapePhrase(title)}" AND artist:"${escapePhrase(artist)}"`;
  const url = `https://musicbrainz.org/ws/2/recording?fmt=json&limit=25&query=${encodeURIComponent(q)}`;
  try {
    const json = await mbFetchJson(url);
    return json.recordings || [];
  } finally {
    await wait(1300); // rate limit
  }
}
/**
 * Fetch one MusicBrainz recording including its releases and artist credits.
 *
 * The rate-limit pause runs whether or not the request succeeds, so that a
 * caller which ignores the error and moves on does not fire the next request
 * without any delay.
 *
 * @param {string} mbid - MusicBrainz recording identifier.
 * @returns {Promise<Object>} The parsed recording document.
 */
async function getRecordingDetails(mbid) {
  const url = `https://musicbrainz.org/ws/2/recording/${encodeURIComponent(mbid)}?fmt=json&inc=releases+artist-credits`;
  try {
    return await mbFetchJson(url);
  } finally {
    await wait(1300); // rate limit
  }
}
/**
 * Choose the search result that most plausibly is the wanted track.
 *
 * Every candidate gets a score made of the search score, title and artist
 * similarity, closeness of its length to `durationMs`, the number of attached
 * releases and a small bonus for older first-release years. Candidates whose
 * title and artist similarity are both at least 0.55 and that carry a first
 * release year form a preferred group: if it is non-empty, its member with the
 * earliest year wins (higher score breaks ties). Otherwise the highest-scoring
 * candidate overall is returned.
 *
 * @param {Array<Object>} candidates - Recordings from the search response.
 * @param {string} artist - Wanted artist.
 * @param {string} title - Wanted title.
 * @param {?number} durationMs - Local track length in ms, if known.
 * @returns {?Object} The chosen recording, or null when `candidates` is empty.
 */
function pickBestRecording(candidates, artist, title, durationMs) {
  const wantedArtist = normalize(artist);
  const wantedTitle = normalize(title);
  const DURATION_WINDOW_MS = 15000;
  const MIN_SIMILARITY = 0.55;

  let topRecording = null;
  let topScore = -Infinity;
  const datedMatches = [];

  for (const rec of candidates) {
    const credited = (rec['artist-credit'] || [])
      .map((credit) => credit.name || credit.artist?.name)
      .filter(Boolean)
      .join(' ');
    const titleSim = similar(rec.title || '', wantedTitle);
    const artistSim = similar(credited, wantedArtist);

    let score = (rec.score || 0) / 100 + titleSim * 1.5 + artistSim * 1.2;

    // Length agreement: full bonus for an exact match, fading to none at 15 s apart.
    if (durationMs && rec.length) {
      const gap = Math.abs(rec.length - durationMs);
      const closeness = Math.max(0, 1 - Math.min(gap, DURATION_WINDOW_MS) / DURATION_WINDOW_MS);
      score += closeness * 0.8;
    }

    // More attached releases count as more evidence, capped at five.
    if (Array.isArray(rec.releases)) {
      score += Math.min(5, rec.releases.length) * 0.05;
    }

    const firstYear = parseDateToYear(rec['first-release-date']);
    if (firstYear) {
      // Slight preference for older originals: the factor is (2100 - year) / 2100,
      // i.e. roughly 0.07 for 1960 and 0.04 for 2020, before the 0.3 weight.
      const ageBias = Math.max(0, 2100 - firstYear) / 2100;
      score += ageBias * 0.3;
    }

    if (score > topScore) {
      topScore = score;
      topRecording = rec;
    }
    if (firstYear && titleSim >= MIN_SIMILARITY && artistSim >= MIN_SIMILARITY) {
      datedMatches.push({ rec, firstYear, score });
    }
  }

  if (datedMatches.length === 0) return topRecording;
  // Earliest year first; among equal years the better score first.
  datedMatches.sort((x, y) => x.firstYear - y.firstYear || y.score - x.score);
  return datedMatches[0].rec;
}
/**
 * Extract the year from a date value that starts with four digits
 * (e.g. "1987", "1987-03" or "1987-03-21").
 *
 * @param {*} dateStr - Date value; it is converted to a string before matching.
 * @returns {?number} The year, or null for falsy input or when the value does
 *   not start with four digits.
 */
function parseDateToYear(dateStr) {
  if (!dateStr) return null;
  const leadingYear = String(dateStr).match(/^(\d{4})/);
  return leadingYear === null ? null : Number(leadingYear[1]);
}
/**
 * Find the earliest entry in a list of date values.
 *
 * Entries that are falsy or from which no year can be parsed are ignored. The
 * remaining values are compared directly with `<`, which for ISO-style date
 * strings orders them chronologically, a shorter date ("1985") sorting before
 * a longer one that it prefixes ("1985-01-01").
 *
 * @param {Array<*>} dates - Candidate date values.
 * @returns {{date: *, year: ?number}} The earliest date with its year, or
 *   `{ date: null, year: null }` when no entry is usable.
 */
function earliestDate(dates) {
  let winner = null;
  for (const candidate of dates) {
    if (!candidate) continue;
    const candidateYear = parseDateToYear(candidate);
    if (!candidateYear) continue;
    // Strict comparison keeps the first of several equal dates.
    if (winner === null || candidate < winner.date) {
      winner = { date: candidate, year: candidateYear };
    }
  }
  return winner === null ? { date: null, year: null } : winner;
}
/**
 * Resolve the earliest known release year of one track via MusicBrainz.
 *
 * The cache is consulted first (unless FORCE is set) under the key
 * "normalized artist|normalized title". On a miss the track is searched, the
 * best recording is picked, and the first ten sufficiently similar search
 * results (title and artist similarity both >= 0.5) are examined for an even
 * earlier release year. For results without a first-release date, up to five
 * detail requests per track are made to derive one from the releases.
 *
 * @param {string} file - File name the result belongs to.
 * @param {{artist: string, title: string, durationMs: ?number}} meta - Track metadata.
 * @param {Object} cache - Lookup cache; a fresh result is stored into it.
 * @returns {Promise<Object>} Result with `file`, `title`, `artist`, `mbid`,
 *   `earliestDate`, `year` (null when unknown) and `confidence`; cache hits
 *   additionally carry `fromCache: true`.
 * @throws {Error} When artist or title is missing, the search returns
 *   nothing, or no recording can be picked.
 */
async function resolveOne(file, meta, cache) {
  const key = `${normalize(meta.artist)}|${normalize(meta.title)}`;
  // A cached entry remembers the file it was first resolved for. Report the
  // current file instead, otherwise renamed or duplicate files are attributed
  // to the old name and go missing from the per-file index.
  if (!FORCE && cache[key]) return { ...cache[key], file, fromCache: true };
  if (!meta.artist || !meta.title) throw new Error('Missing artist/title');

  const recs = await searchRecording(meta.artist, meta.title);
  if (!recs.length) throw new Error('No recordings found');
  const best = pickBestRecording(recs, meta.artist, meta.title, meta.durationMs);
  if (!best) throw new Error('No suitable match');

  // Space-joined names of everyone credited on a recording.
  const creditedArtists = (rec) =>
    (rec['artist-credit'] || []).map((ac) => ac.name || ac.artist?.name).filter(Boolean).join(' ');

  let firstDate = best['first-release-date'] || null;
  let year = parseDateToYear(firstDate);

  // If no year on best, or if best appears to be a later reissue, inspect more candidates
  const viable = recs
    .map((r) => ({
      r,
      titleSim: similar(r.title || '', meta.title),
      artistSim: similar(creditedArtists(r), meta.artist),
      firstYear: parseDateToYear(r['first-release-date']) || null,
    }))
    .filter((v) => v.titleSim >= 0.5 && v.artistSim >= 0.5);

  // Determine earliest among top candidates, fetching details when missing.
  // Infinity is the "no year known yet" sentinel and must never leave this function.
  let earliest = { year: year || Infinity, date: firstDate || null, id: best.id };
  const detailsBudget = 5; // limit extra calls per track
  let detailsUsed = 0;
  for (const v of viable.slice(0, 10)) {
    let y = v.firstYear;
    let d = v.r['first-release-date'] || null;
    if (!y && detailsUsed < detailsBudget) {
      // Count the attempt up front so failing requests also consume the budget.
      detailsUsed++;
      try {
        const details = await getRecordingDetails(v.r.id);
        const dates = (details.releases || []).map((re) => re.date || re['release-events']?.[0]?.date || null);
        const er = earliestDate(dates);
        y = er.year;
        d = er.date;
      } catch {
        // Best effort: a failed detail lookup only means this candidate cannot
        // contribute a date; the remaining candidates are still examined.
      }
    }
    if (y && y < earliest.year) {
      earliest = { year: y, date: d, id: v.r.id };
    }
  }

  // Adopt the earlier candidate only when a real year was found; with the
  // sentinel still in place the year stays as derived from `best` (possibly null).
  if (Number.isFinite(earliest.year) && earliest.year !== year) {
    year = earliest.year;
    firstDate = earliest.date;
  }

  const result = {
    file,
    title: meta.title,
    artist: meta.artist,
    mbid: earliest.id || best.id,
    earliestDate: firstDate,
    year,
    confidence: {
      mbScore: best.score || null,
      titleSim: similar(best.title || '', meta.title),
      artistSim: similar(creditedArtists(best), meta.artist),
      durationMs: meta.durationMs,
      matchedDurationMs: best.length || null,
    },
  };
  cache[key] = result;
  return result;
}
/**
 * Script entry point: resolve a release year for every audio file in the data
 * directory and write the results to OUT_JSON.
 *
 * Files are taken from DATA_DIR (extensions mp3, wav, m4a, ogg), optionally
 * narrowed by FILE_FILTER and limited to MAX files. For each file the tags are
 * read, falling back to "Artist - Title" parsed from the file name. A failure
 * for one file is recorded in the output with an `error` field and does not
 * stop the run. The cache is saved after every fresh lookup so that an
 * interrupted run keeps the lookups already made, and once more at the end.
 *
 * Exits the process with status 1 when no matching audio file is found.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('Scanning data dir:', DATA_DIR);
  let files = fs.readdirSync(DATA_DIR).filter((f) => /\.(mp3|wav|m4a|ogg)$/i.test(f));
  if (FILE_FILTER) {
    files = files.filter((f) => f.toLowerCase().includes(FILE_FILTER));
  }
  if (!files.length) {
    console.error('No audio files in data/.');
    process.exit(1);
  }

  // A non-positive --max means "no limit" rather than "process nothing".
  const limit = MAX > 0 ? MAX : 0;
  // Number of files that will actually be processed, for the progress display.
  const total = limit ? Math.min(limit, files.length) : files.length;

  const cache = await readCache();
  const results = [];
  let count = 0;
  for (const f of files) {
    if (limit && count >= limit) break;
    const fp = path.join(DATA_DIR, f);
    const fromName = parseFromFilename(f);
    const tags = await getMeta(fp);
    // Tags win; the file name only fills in what the tags lack.
    const artist = tags.artist || fromName.artist;
    const title = tags.title || fromName.title;
    const meta = { artist, title, durationMs: tags.durationMs };
    count++;
    console.log(`\n[${count}/${total}] ${f}`);
    console.log(` -> ${artist} - ${title}`);
    try {
      const r = await resolveOne(f, meta, cache);
      results.push(r);
      console.log(` ✓ Earliest: ${r.earliestDate || 'n/a'} (year=${r.year || 'n/a'}) [${r.fromCache ? 'cache' : 'MB'}]`);
      if (!r.fromCache) {
        // Each fresh lookup costs at least one rate-limited request, so keep it
        // on disk right away. A failed save is not fatal; the final save retries.
        try {
          await writeCache(cache);
        } catch (err) {
          console.warn(' ! Could not save cache:', err.message);
        }
      }
    } catch (e) {
      console.warn(' ! Failed:', e.message);
      results.push({ file: f, title, artist, mbid: null, earliestDate: null, year: null, error: e.message });
    }
  }

  // Build index by file
  const byFile = Object.fromEntries(results.map((r) => [r.file, { year: r.year, date: r.earliestDate, title: r.title, artist: r.artist, mbid: r.mbid }]));
  const out = { generatedAt: new Date().toISOString(), total: results.length, byFile, results };
  await fsp.writeFile(OUT_JSON, JSON.stringify(out, null, 2));
  await writeCache(cache);
  console.log(`\nWritten ${OUT_JSON} with ${results.length} entries.`);
  console.log('Cache saved:', path.basename(CACHE_JSON));
}
// Node releases before 18 ship no global fetch; borrow undici's implementation there.
if (typeof fetch === 'undefined') {
  const undiciModule = await import('undici');
  globalThis.fetch = undiciModule.default.fetch;
}
// Run the script. Any error that escapes main() is printed and the process exits with status 1.
main().catch((e) => {
console.error(e);
process.exit(1);
});