#! /usr/bin/env python """ Script to evaluate pitch algorithms against TONAS database. See http://mtg.upf.edu/download/datasets/tonas/ Example run: $ ./eval_pitch /path/to/TONAS/*/*.wav OK: 94.74% vx r: 96.87% vx f: 15.83% f0: 96.02% %12: 0.50% /path/to/TONAS/Deblas/01-D_AMairena.wav OK: 89.89% vx r: 93.21% vx f: 13.81% f0: 90.74% %12: 1.51% /path/to/TONAS/Deblas/02-D_ChanoLobato.wav OK: 96.02% vx r: 96.73% vx f: 10.91% f0: 96.42% %12: 0.00% /path/to/TONAS/Deblas/03-D_Chocolate.wav [...] OK: 82.35% vx r: 95.52% vx f: 67.09% f0: 89.80% %12: 0.95% /path/to/TONAS/Martinetes2/80-M2_Rancapinos.wav OK: 61.97% vx r: 85.71% vx f: 22.03% f0: 55.63% %12: 8.57% /path/to/TONAS/Martinetes2/81-M2_SDonday.wav OK: 75.26% vx r: 91.63% vx f: 27.27% f0: 75.99% %12: 5.05% /path/to/TONAS/Martinetes2/82-M2_TiaAnicalaPiriniaca.wav OK: 82.77% vx r: 92.74% vx f: 38.27% f0: 87.33% %12: 1.67% 69 files, total_length: 1177.69s, total runtime: 25.91s """ import sys import time import os.path import numpy from .utils import array_from_text_file, array_from_yaml_file from aubio import source, pitch, freqtomidi start = time.time() freq_tol = .50 # more or less half a tone methods = ["default", "yinfft", "mcomb", "yin", "fcomb", "schmitt", "specacf"] method = methods[0] downsample = 1 tolerance = 0.35 silence = -40. skip = 1 if method in ["yinfft", "default"]: downsample = 1 tolerance = 0.45 elif method == "mcomb": downsample = 4 elif method == "yin": downsample = 4 tolerance = 0.2 samplerate = 44100 / downsample hop_s = 512 / downsample win_s = 2048 / downsample def get_pitches (filename, samplerate = samplerate, win_s = win_s, hop_s = hop_s): s = source(filename, samplerate, hop_s) samplerate = s.samplerate p = pitch(method, win_s, hop_s, samplerate) p.set_unit("freq") p.set_tolerance(tolerance) p.set_silence(silence) # list of pitches, in samples pitches = [] # total number of frames read total_frames = 0 while True: samples, read = s() new_pitch = p(samples)[0] pitches.append([total_frames/float(samplerate), new_pitch]) total_frames += read if read < hop_s: break return numpy.array(pitches) total_correct_f0, total_correct_sil, total_missed, total_incorrect, total_fp, total_total = 0, 0, 0, 0, 0, 0 total_correct_chroma, total_voiced = 0, 0 for source_file in sys.argv[1:]: ground_truth_file = source_file.replace('.wav', '.f0.Corrected') if os.path.isfile(ground_truth_file): ground_truth = array_from_text_file(ground_truth_file)[:,[0,2]] experiment = get_pitches(source_file) # check that we have the same length, more or less one frame assert abs(len(ground_truth) - len(experiment)) < 2 # align experiment by skipping first results experiment = experiment[skip:] experiment[:,0] -= experiment[0,0] # trim to shortest list maxlen = min(len(ground_truth), len(experiment)) experiment = experiment[:maxlen] ground_truth = ground_truth[:maxlen] # get difference matrix diffmat = abs(experiment - ground_truth) # make sure we got the timing right assert max(diffmat[:,0]) < 10e-4, source_file truth_pitches = freqtomidi(ground_truth[:,1]) exper_pitches = freqtomidi(experiment[:,1]) total = len(truth_pitches) unvoiced = len(truth_pitches[truth_pitches == 0]) voiced = total - unvoiced correct_sil, fp, missed, correct_f0, correct_chroma, incorrect = 0, 0, 0, 0, 0, 0 for a, b in zip(truth_pitches, exper_pitches): if a == 0 and b == 0: correct_sil += 1 elif a == 0 and b != 0: fp += 1 elif a != 0 and b == 0: missed += 1 elif abs(b - a) < freq_tol: correct_f0 += 1 elif abs(b - a) % 12. < freq_tol: correct_chroma += 1 else: incorrect += 1 assert correct_sil + fp + missed + correct_f0 + correct_chroma + incorrect == total assert unvoiced == correct_sil + fp assert voiced == missed + correct_f0 + correct_chroma + incorrect print "OK: %6s%%" % ("%.2f" % (100. * (correct_f0 + correct_sil) / total )), print "vx r: %6s%%" % ("%.2f" % (100. - 100. * missed / voiced)), print "vx f: %6s%%" % ("%.2f" % (100. * fp / unvoiced)), print "f0: %6s%%" % ("%.2f" % (100. * correct_f0 / voiced)), print "%%12: %6s%%" % ("%.2f" % (100. * correct_chroma / voiced)), print source_file total_correct_sil += correct_sil total_correct_f0 += correct_f0 total_correct_chroma += correct_chroma total_missed += missed total_incorrect += incorrect total_fp += fp total_voiced += voiced total_total += total else: print "ERR", "could not find ground_truth_file", ground_truth_file print "OK: %6s%%" % ("%.2f" % (100. * (total_correct_f0 + total_correct_sil) / total_total )), print "vx r: %6s%%" % ("%.2f" % (100. - 100. * total_missed / total_voiced)), print "vx f: %6s%%" % ("%.2f" % (100. * (total_fp) / (total_correct_sil + total_fp))), print "f0: %6s%%" % ("%.2f" % (100. * total_correct_f0 / total_voiced)), print "%%12: %6s%%" % ("%.2f" % (100. * total_correct_chroma / total_voiced)), print "%d files," % len(sys.argv[1:]), print "total_length: %.2fs," % ((total_total * hop_s) / float(samplerate)), print "total runtime: %.2fs" % (time.time() - start)