// Resolve the earliest release year for songs in data/ using MusicBrainz.
// Usage: node scripts/resolve-years.js [--max N] [--force] [--file SUBSTR]
// Respects the MusicBrainz 1 req/sec guideline and caches results.

import fs from 'fs';
import fsp from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
import { parseFile as mmParseFile } from 'music-metadata';
import { setTimeout as wait } from 'timers/promises';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const ROOT = path.resolve(__dirname, '..');
const DATA_DIR = path.join(ROOT, 'data');
const OUT_JSON = path.join(DATA_DIR, 'years.json');
const CACHE_JSON = path.join(DATA_DIR, '.mb_cache.json');

const CONTACT = process.env.MB_CONTACT || 'local';
const USER_AGENT = `hitstar-years/0.1.0 (${CONTACT})`;

const args = new Set(process.argv.slice(2));
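// Read a CLI flag given either as `--flag value` or `--flag=value`.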
function getArgValue(name, defVal) {
  const i = process.argv.findIndex((a) => a === name || a.startsWith(name + '='));
  if (i === -1) return defVal;
  const a = process.argv[i];
  if (a.includes('=')) return a.split('=')[1];
  return process.argv[i + 1] && !process.argv[i + 1].startsWith('--')
    ? process.argv[i + 1]
    : defVal;
}
const MAX = parseInt(getArgValue('--max', '0'), 10) || 0;
const FORCE = args.has('--force');
const FILE_FILTER = getArgValue('--file', '').toLowerCase();

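// Normalize a title/artist string for fuzzy matching: strip featuring credits,
// remaster/edit/version qualifiers and punctuation, unify quotes, lowercase.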
function normalize(str) {
  if (!str) return '';
  let s = String(str)
    .replace(/\s*\([^)]*(feat\.|ft\.|featuring)[^)]*\)/gi, '') // remove (feat. ...)
    .replace(
      /\s*\[(?:radio edit|remaster(?:ed)?(?: \d{2,4})?|single version|album version|mono|stereo|live|version)\]/gi,
      ''
    )
    .replace(
      /\s*-\s*(?:radio edit|remaster(?:ed)?(?: \d{2,4})?|single version|album version|mono|stereo|live|version)\b/gi,
      ''
    )
    .replace(
      /\s*\((?:radio edit|remaster(?:ed)?(?: \d{2,4})?|single version|album version|mono|stereo|live|version|short mix|original mix|201\d remaster|20\d\d remaster)\)/gi,
      ''
    )
    .replace(/\s*&\s*/g, ' and ')
    .replace(/\s+feat\.?\s+/gi, ' ')
    .replace(/\s+ft\.?\s+/gi, ' ')
    .replace(/[“”]/g, '"')
    .replace(/[’‘']/g, "'")
    .replace(/[^a-z0-9'"\s]/gi, ' ')
    .replace(/\s+/g, ' ')
    .trim()
    .toLowerCase();
  // trim leading/trailing dashes and spaces
  s = s.replace(/^[-\s]+/, '').replace(/[-\s]+$/, '');
  return s;
}

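// Fall back to the "Artist - Title" filename convention when tags are missing.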
function parseFromFilename(file) {
  const base = path.parse(file).name;
  const m = base.match(/^(.*?)\s+-\s+(.*)$/); // Artist - Title
  if (m) {
    return { artist: m[1].trim(), title: m[2].trim() };
  }
  return { artist: '', title: base };
}

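// Read title/artist/duration/year from the file's tags via music-metadata;
// returns empty fields if the file cannot be parsed.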
async function getMeta(fp) {
  try {
    const meta = await mmParseFile(fp, { duration: true });
    return {
      title: meta.common.title || '',
      artist: meta.common.artist || '',
      durationMs: Number.isFinite(meta.format.duration)
        ? Math.round(meta.format.duration * 1000)
        : null,
      yearTag: meta.common.year || null,
    };
  } catch {
    return { title: '', artist: '', durationMs: null, yearTag: null };
  }
}

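// Lookup cache stored in data/.mb_cache.json; lets re-runs skip MusicBrainz calls.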
async function readCache() {
  try {
    const j = JSON.parse(await fsp.readFile(CACHE_JSON, 'utf8'));
    return j || {};
  } catch {
    return {};
  }
}

async function writeCache(cache) {
  await fsp.writeFile(CACHE_JSON, JSON.stringify(cache, null, 2));
}

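// Similarity in [0, 1] between two strings: 1 for an exact normalized match,
// otherwise Jaccard overlap of their word tokens.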
function similar(a, b) {
  a = normalize(a);
  b = normalize(b);
  if (!a || !b) return 0;
  if (a === b) return 1;
  // simple token overlap Jaccard
  const as = new Set(a.split(' '));
  const bs = new Set(b.split(' '));
  const inter = [...as].filter((x) => bs.has(x)).length;
  const union = new Set([...as, ...bs]).size;
  return inter / union;
}

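// GET JSON from MusicBrainz with the required User-Agent; retries on 429/503,
// honouring Retry-After when present.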
async function mbFetchJson(url, retries = 3) {
  for (let i = 0; i < retries; i++) {
    const res = await fetch(url, {
      headers: { 'User-Agent': USER_AGENT, Accept: 'application/json' },
    });
    if (res.status === 503 || res.status === 429) {
      const ra = Number(res.headers.get('Retry-After')) || 2;
      await wait(ra * 1000);
      continue;
    }
    if (!res.ok) {
      const text = await res.text().catch(() => '');
      throw new Error(`HTTP ${res.status} ${res.statusText} - ${text}`);
    }
    return res.json();
  }
  throw new Error('Failed after retries');
}

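// Search MusicBrainz recordings by title and artist (top 25 results).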
async function searchRecording(artist, title) {
  const q = `recording:"${title}" AND artist:"${artist}"`;
  const url = `https://musicbrainz.org/ws/2/recording?fmt=json&limit=25&query=${encodeURIComponent(q)}`;
  const json = await mbFetchJson(url);
  await wait(1300); // rate limit
  return json.recordings || [];
}

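// Look up a single recording by MBID, including its releases and artist credits.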
async function getRecordingDetails(mbid) {
  const url = `https://musicbrainz.org/ws/2/recording/${encodeURIComponent(mbid)}?fmt=json&inc=releases+artist-credits`;
  const json = await mbFetchJson(url);
  await wait(1300); // rate limit
  return json;
}

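// Score search candidates by MusicBrainz score, title/artist similarity and
// duration proximity; among sufficiently similar candidates, prefer the one
// with the earliest known first-release year.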
function pickBestRecording(candidates, artist, title, durationMs) {
  const nArtist = normalize(artist);
  const nTitle = normalize(title);
  let best = null;
  let bestScore = -Infinity;
  const viable = [];
  for (const r of candidates) {
    const rTitle = r.title || '';
    const rArtists = (r['artist-credit'] || [])
      .map((ac) => ac.name || ac.artist?.name)
      .filter(Boolean)
      .join(' ');
    const titleSim = similar(rTitle, nTitle);
    const artistSim = similar(rArtists, nArtist);
    let score = (r.score || 0) / 100 + titleSim * 1.5 + artistSim * 1.2;
    if (durationMs && r.length) {
      const diff = Math.abs(r.length - durationMs);
      const durScore = Math.max(0, 1 - Math.min(diff, 15000) / 15000); // within 15s window
      score += durScore * 0.8;
    }
    // Prefer those with more releases (more evidence)
    if (Array.isArray(r.releases)) score += Math.min(5, r.releases.length) * 0.05;
    const firstYear = parseDateToYear(r['first-release-date']);
    if (firstYear) {
      // Tiny bias towards older original releases
      const ageBias = Math.max(0, 2100 - firstYear) / 2100; // e.g. ≈0.06 for 1965, ≈0.04 for 2015
      score += ageBias * 0.3;
    }
    if (score > bestScore) {
      bestScore = score;
      best = r;
    }
    if (titleSim >= 0.55 && artistSim >= 0.55) {
      viable.push({ r, firstYear: firstYear || null, titleSim, artistSim, score });
    }
  }
  // Among viable matches, prefer the one with the earliest known first release year
  const withYear = viable.filter((v) => v.firstYear);
  if (withYear.length) {
    withYear.sort((a, b) => a.firstYear - b.firstYear || b.score - a.score);
    return withYear[0].r;
  }
  return best;
}

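// Extract the four-digit year from a MusicBrainz date ("1971", "1971-05", "1971-05-11").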
function parseDateToYear(dateStr) {
  if (!dateStr) return null;
  const re = /^(\d{4})/;
  const m = re.exec(String(dateStr));
  return m ? Number(m[1]) : null;
}

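// Pick the earliest valid date from a list (string comparison works for ISO-style dates).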
function earliestDate(dates) {
  const valid = dates
    .filter(Boolean)
    .map((d) => ({ d, y: parseDateToYear(d) }))
    .filter((x) => x.y);
  if (!valid.length) return { date: null, year: null };
  valid.sort((a, b) => {
    if (a.d < b.d) return -1;
    if (a.d > b.d) return 1;
    return 0;
  });
  return { date: valid[0].d, year: valid[0].y };
}

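// Resolve one track: search MusicBrainz, pick the best match, then widen the
// search across similar candidates (fetching release details for a few of them)
// to find the earliest release date. Results are memoized in `cache`.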
async function resolveOne(file, meta, cache) {
  const key = `${normalize(meta.artist)}|${normalize(meta.title)}`;
  if (!FORCE && cache[key]) return { ...cache[key], fromCache: true };
  if (!meta.artist || !meta.title) throw new Error('Missing artist/title');

  const recs = await searchRecording(meta.artist, meta.title);
  if (!recs.length) throw new Error('No recordings found');
  const best = pickBestRecording(recs, meta.artist, meta.title, meta.durationMs);
  if (!best) throw new Error('No suitable match');
  let firstDate = best['first-release-date'] || null;
  let year = parseDateToYear(firstDate);
  // If no year on best, or if best appears to be a later reissue, inspect more candidates
  const viable = recs
    .map((r) => ({
      r,
      titleSim: similar(r.title || '', meta.title),
      artistSim: similar(
        (r['artist-credit'] || [])
          .map((ac) => ac.name || ac.artist?.name)
          .filter(Boolean)
          .join(' '),
        meta.artist
      ),
      firstYear: parseDateToYear(r['first-release-date']) || null,
    }))
    .filter((v) => v.titleSim >= 0.5 && v.artistSim >= 0.5);

  // Determine earliest among top candidates, fetching details when missing
  let earliest = { year: year || Infinity, date: firstDate || null, id: best.id };
  const detailsBudget = 5; // limit extra calls per track
  let detailsUsed = 0;
  for (const v of viable.slice(0, 10)) {
    let y = v.firstYear;
    let d = v.r['first-release-date'] || null;
    if (!y && detailsUsed < detailsBudget) {
      try {
        const details = await getRecordingDetails(v.r.id);
        detailsUsed++;
        const dates = (details.releases || []).map(
          (re) => re.date || re['release-events']?.[0]?.date || null
        );
        const er = earliestDate(dates);
        y = er.year;
        d = er.date;
      } catch {
        // ignore failed detail lookups; fall back to the search data
      }
    }
    if (y && y < (earliest.year || Infinity)) {
      earliest = { year: y, date: d, id: v.r.id };
    }
  }
  // Only adopt the earlier candidate if a real year was found (skip the Infinity sentinel)
  if (Number.isFinite(earliest.year) && earliest.year !== year) {
    year = earliest.year;
    firstDate = earliest.date;
  }
  const result = {
    file,
    title: meta.title,
    artist: meta.artist,
    mbid: earliest.id || best.id,
    earliestDate: firstDate,
    year,
    confidence: {
      mbScore: best.score || null,
      titleSim: similar(best.title || '', meta.title),
      artistSim: similar(
        (best['artist-credit'] || [])
          .map((ac) => ac.name || ac.artist?.name)
          .filter(Boolean)
          .join(' '),
        meta.artist
      ),
      durationMs: meta.durationMs,
      matchedDurationMs: best.length || null,
    },
  };
  cache[key] = result;
  return result;
}

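// Scan data/, resolve each track and write data/years.json
// ({ generatedAt, total, byFile, results }).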
async function main() {
  console.log('Scanning data dir:', DATA_DIR);
  let files = fs.readdirSync(DATA_DIR).filter((f) => /\.(mp3|wav|m4a|ogg)$/i.test(f));
  if (FILE_FILTER) {
    files = files.filter((f) => f.toLowerCase().includes(FILE_FILTER));
  }
  if (!files.length) {
    console.error('No audio files in data/.');
    process.exit(1);
  }
  const cache = await readCache();
  const results = [];

  let count = 0;
  for (const f of files) {
    if (MAX && count >= MAX) break;
    const fp = path.join(DATA_DIR, f);
    const fromName = parseFromFilename(f);
    const tags = await getMeta(fp);
    const artist = tags.artist || fromName.artist;
    const title = tags.title || fromName.title;
    const meta = { artist, title, durationMs: tags.durationMs };
    count++;
    console.log(`\n[${count}/${MAX || files.length}] ${f}`);
    console.log(`  -> ${artist} — ${title}`);
    try {
      const r = await resolveOne(f, meta, cache);
      results.push(r);
      console.log(
        `  ✓ Earliest: ${r.earliestDate || 'n/a'} (year=${r.year || 'n/a'}) [${r.fromCache ? 'cache' : 'MB'}]`
      );
    } catch (e) {
      console.warn('  ! Failed:', e.message);
      results.push({
        file: f,
        title,
        artist,
        mbid: null,
        earliestDate: null,
        year: null,
        error: e.message,
      });
    }
  }

  // Build index by file
  const byFile = Object.fromEntries(
    results.map((r) => [
      r.file,
      { year: r.year, date: r.earliestDate, title: r.title, artist: r.artist, mbid: r.mbid },
    ])
  );
  const out = { generatedAt: new Date().toISOString(), total: results.length, byFile, results };
  await fsp.writeFile(OUT_JSON, JSON.stringify(out, null, 2));
  await writeCache(cache);
  console.log(`\nWritten ${OUT_JSON} with ${results.length} entries.`);
  console.log('Cache saved:', path.basename(CACHE_JSON));
}

// Ensure fetch exists in Node <18
if (typeof fetch === 'undefined') {
  const { default: undici } = await import('undici');
  global.fetch = undici.fetch;
}

main().catch((e) => {
  console.error(e);
  process.exit(1);
});