From 57630f640ed6f2a6850043cec13f89af74b67741 Mon Sep 17 00:00:00 2001 From: Paul Brossier Date: Tue, 8 Jan 2019 16:49:22 +0100 Subject: [PATCH] [pitch_crepe] first version --- src/pitch/pitch_crepe.c | 501 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 501 insertions(+) create mode 100644 src/pitch/pitch_crepe.c diff --git a/src/pitch/pitch_crepe.c b/src/pitch/pitch_crepe.c new file mode 100644 index 00000000..adeafe68 --- /dev/null +++ b/src/pitch/pitch_crepe.c @@ -0,0 +1,501 @@ +/* + Copyright (C) 2018 Paul Brossier + + This file is part of aubio. + + aubio is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + aubio is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with aubio. If not, see . + +*/ + +/* CREPE pitch algorithm + + References + ---------- + + CREPE: A Convolutional Representation for Pitch Estimation Jong Wook Kim, + Justin Salamon, Peter Li, Juan Pablo Bello. Proceedings of the IEEE + International Conference on Acoustics, Speech, and Signal Processing (ICASSP), + 2018. Available online at https://arxiv.org/abs/1802.06182 + + Original implementation available at https://github.com/marl/crepe + +*/ + +#include "aubio_priv.h" + +#include "fmat.h" +#include "ai/tensor.h" +#include "ai/conv1d.h" +#include "ai/maxpool1d.h" +#include "ai/batchnorm.h" +#include "ai/dense.h" +#include "io/file_hdf5.h" +#include "utils/scale.h" + +#define HDF5_FILE_PATH "crepe-model-tiny.h5" + +// public prototypes +typedef struct _aubio_pitch_crepe_t aubio_pitch_crepe_t; +aubio_pitch_crepe_t *new_aubio_pitch_crepe(void); +void aubio_pitch_crepe_do(aubio_pitch_crepe_t *t, fvec_t *input, fvec_t *out); +void del_aubio_pitch_crepe(aubio_pitch_crepe_t *t); +smpl_t aubio_pitch_crepe_get_confidence (aubio_pitch_crepe_t * o); +uint_t aubio_pitch_crepe_set_tolerance(aubio_pitch_crepe_t * o, smpl_t + tolerance); +smpl_t aubio_pitch_crepe_get_tolerance (aubio_pitch_crepe_t * o); + +// static prototypes +static uint_t aubio_pitch_crepe_load_params(aubio_pitch_crepe_t *o); + +struct _aubio_pitch_crepe_t +{ + // number of [conv, maxpool, batchnorm] groups + uint_t n_layers; + // layers + aubio_conv1d_t **conv_layers; + aubio_maxpool1d_t **maxpool_layers; + aubio_batchnorm_t **batchnorm_layers; + aubio_dense_t *dense_layer; + // input/output tensors + aubio_tensor_t *input_tensor; + aubio_tensor_t **maxpool_output; + aubio_tensor_t **batchnorm_output; + aubio_tensor_t **conv_output; + aubio_tensor_t *flattened; + aubio_tensor_t *dense_output; + + smpl_t confidence; + smpl_t tolerance; + aubio_scale_t *scale; +}; + +aubio_pitch_crepe_t *new_aubio_pitch_crepe(void) +{ + aubio_pitch_crepe_t *o = AUBIO_NEW(aubio_pitch_crepe_t); + aubio_tensor_t *block_input; + // algorithm constants + uint_t input_shape[2] = {1024, 1}; + uint_t capacity_modes[5] = {4, 8, 16, 24, 32}; + uint_t n_filters[6] = {32, 4, 4, 4, 8, 16}; + uint_t widths[6] = {512, 64, 64, 64, 64, 64}; + uint_t maxpool_stride[1] = {2}; + uint_t l0_stride[1] = {4}; + uint_t n_dense = 360; + + // local variables + uint_t capacity_mode = 0; + uint_t capacity = capacity_modes[capacity_mode]; + uint_t output_shape[2]; + uint_t i; + + AUBIO_ASSERT (capacity_mode < 5 && (sint_t)capacity_mode >= 0); + + o->n_layers = 6; + // create arrays of layers and tensors + o->conv_layers = AUBIO_ARRAY(aubio_conv1d_t*, o->n_layers); + o->conv_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers); + o->maxpool_layers = AUBIO_ARRAY(aubio_maxpool1d_t*, o->n_layers); + o->maxpool_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers); + o->batchnorm_layers = AUBIO_ARRAY(aubio_batchnorm_t*, o->n_layers); + o->batchnorm_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers); + + if (!o->conv_layers || !o->conv_output + || !o->maxpool_layers || !o->maxpool_output + || !o->batchnorm_layers || !o->batchnorm_output) + goto failure; + + // create layers + for (i = 0; i < o->n_layers; i++) { + uint_t kern_shape[1] = {widths[i]}; + // create convolutional layers + o->conv_layers[i] = new_aubio_conv1d(n_filters[i] * capacity, kern_shape); + if (!o->conv_layers[i]) goto failure; + // set padding='same' + if (aubio_conv1d_set_padding_mode(o->conv_layers[i], "same") != AUBIO_OK) { + goto failure; + } + // set stride of first layer + if ((i == 0) && (aubio_conv1d_set_stride(o->conv_layers[0], + l0_stride) != AUBIO_OK) ) { + goto failure; + } + + // create batchnorm layers + o->batchnorm_layers[i] = new_aubio_batchnorm(n_filters[i] * capacity); + if (!o->batchnorm_layers[i]) goto failure; + + // create maxpool layers + o->maxpool_layers[i] = new_aubio_maxpool1d(maxpool_stride); + if (!o->maxpool_layers[i]) goto failure; + } + + o->dense_layer = new_aubio_dense(n_dense); + if (!o->dense_layer) goto failure; + + // create input/output tensors + o->input_tensor = new_aubio_tensor(2, input_shape); + if (!o->input_tensor) goto failure; + block_input = o->input_tensor; + for (i = 0; i < o->n_layers; i++) { + // get shape of conv1d output and create its tensor + if (aubio_conv1d_get_output_shape(o->conv_layers[i], + block_input, output_shape)) + goto failure; + o->conv_output[i] = new_aubio_tensor(2, output_shape); + if (!o->conv_output[i]) goto failure; + + // get shape of batchnorm output and create its tensor + if (aubio_batchnorm_get_output_shape(o->batchnorm_layers[i], + o->conv_output[i], output_shape)) + goto failure; + o->batchnorm_output[i] = new_aubio_tensor(2, output_shape); + if (!o->batchnorm_output[i]) goto failure; + + // get shape of maxpool1d output and create its tensor + if (aubio_maxpool1d_get_output_shape(o->maxpool_layers[i], + o->batchnorm_output[i], output_shape)) + goto failure; + o->maxpool_output[i] = new_aubio_tensor(2, output_shape); + if (!o->maxpool_output[i]) goto failure; + + // set input for next block + block_input = o->maxpool_output[i]; + } + + uint_t flattened_dim = o->maxpool_output[5]->shape[0]; + flattened_dim *= o->maxpool_output[5]->shape[1]; + uint_t dense_input[1] = {flattened_dim}; + o->flattened = new_aubio_tensor(1, dense_input); + if (!o->flattened) goto failure; + + // permute and flatten + aubio_tensor_t *permute_input = o->maxpool_output[5]; + AUBIO_DBG("permute: (%d, %d) ->" + " (%d, %d) (permutation=(2, 1))\n", + permute_input->shape[0], permute_input->shape[1], + permute_input->shape[1], permute_input->shape[0]); + AUBIO_DBG("flatten: (%d, %d) -> (%d)\n", + permute_input->shape[1], permute_input->shape[0], + o->flattened->shape[0]); + + if (aubio_dense_get_output_shape(o->dense_layer, o->flattened, output_shape)) + goto failure; + o->dense_output = new_aubio_tensor(1, output_shape); + if (!o->dense_output) goto failure; + + AUBIO_ASSERT(n_dense == output_shape[0]); + + if (aubio_pitch_crepe_load_params(o)) + goto failure; + + // map output units to midi note + smpl_t start = 1997.379408437619; + smpl_t end = 7180.; + o->scale = new_aubio_scale(0., 359., start, start + end); + if (!o->scale) goto failure; + + return o; + +failure: + del_aubio_pitch_crepe(o); + return NULL; +} + +void del_aubio_pitch_crepe(aubio_pitch_crepe_t *o) +{ + uint_t i; + AUBIO_ASSERT(o); + + if (o->input_tensor) { + del_aubio_tensor(o->input_tensor); + } + + if (o->batchnorm_output) { + for (i = 0; i < o->n_layers; i++) { + if (o->batchnorm_output[i]) + del_aubio_tensor(o->batchnorm_output[i]); + } + AUBIO_FREE(o->batchnorm_output); + } + + if (o->batchnorm_layers) { + for (i = 0; i < o->n_layers; i++) { + if (o->batchnorm_layers[i]) + del_aubio_batchnorm(o->batchnorm_layers[i]); + } + AUBIO_FREE(o->batchnorm_layers); + } + + if (o->maxpool_output) { + for (i = 0; i < o->n_layers; i++) { + if (o->maxpool_output[i]) + del_aubio_tensor(o->maxpool_output[i]); + } + AUBIO_FREE(o->maxpool_output); + } + + if (o->maxpool_layers) { + for (i = 0; i < o->n_layers; i++) { + if (o->maxpool_layers[i]) + del_aubio_maxpool1d(o->maxpool_layers[i]); + } + AUBIO_FREE(o->maxpool_layers); + } + + if (o->conv_output) { + for (i = 0; i < o->n_layers; i++) { + if (o->conv_output[i]) + del_aubio_tensor(o->conv_output[i]); + } + AUBIO_FREE(o->conv_output); + } + + if (o->conv_layers) { + for (i = 0; i < o->n_layers; i++) { + if (o->conv_layers[i]) + del_aubio_conv1d(o->conv_layers[i]); + } + AUBIO_FREE(o->conv_layers); + } + + if (o->flattened) { + del_aubio_tensor(o->flattened); + } + + if (o->dense_layer) { + del_aubio_dense(o->dense_layer); + } + + if (o->dense_output) { + del_aubio_tensor(o->dense_output); + } + + if (o->scale) { + del_aubio_scale(o->scale); + } + + AUBIO_FREE(o); +} + +void aubio_pitch_crepe_do(aubio_pitch_crepe_t *o, fvec_t *input, fvec_t *out) +{ + uint_t i; + AUBIO_ASSERT(o && input); + // copy input to input tensor + AUBIO_ASSERT(input->length == o->input_tensor->shape[0]); + // normalize frame, removing mean and dividing by std + smpl_t mean = fvec_mean(input); + fvec_add(input, -mean); + smpl_t std = 0.; + for (i = 0; i < input->length; i++) { + std += SQR(input->data[i]); + } + std = SQRT(std / (smpl_t)input->length); + if (std < 1.e-7) std = 1; + + for (i = 0; i < input->length; i++) { + o->input_tensor->data[0][i] = input->data[i] / std; + } + + aubio_tensor_t *block_input = o->input_tensor; + for (i = 0; i < o->n_layers; i++) { + aubio_conv1d_do(o->conv_layers[i], block_input, + o->conv_output[i]); + aubio_batchnorm_do(o->batchnorm_layers[i], o->conv_output[i], + o->batchnorm_output[i]); + aubio_maxpool1d_do(o->maxpool_layers[i], o->batchnorm_output[i], + o->maxpool_output[i]); + block_input = o->maxpool_output[i]; + } + + aubio_tensor_t *permute_input = o->maxpool_output[5]; + // perform flattening (permutation has no effect here, order unchanged) + AUBIO_ASSERT (permute_input->size == o->flattened->size); + for (i = 0; i < permute_input->size; i++) { + o->flattened->data[0][i] = permute_input->data[0][i]; + } + + // compute dense layer + aubio_dense_do(o->dense_layer, o->flattened, o->dense_output); + +#if 0 + // print debug output + for (i = 0; i < o->n_layers; i++) { + AUBIO_DBG("pitch_crepe: conv1d[%d] %f\n", i, + aubio_tensor_max(o->conv_output[i])); + AUBIO_DBG("pitch_crepe: batchnorm[%d] %f\n", i, + aubio_tensor_max(o->batchnorm_output[i])); + AUBIO_DBG("pitch_crepe: maxpool1d[%d] %f\n", i, + aubio_tensor_max(o->maxpool_output[i])); + } + AUBIO_DBG("pitch_crepe: dense %f\n", aubio_tensor_max(o->dense_output)); +#endif + + // find maximum activation + fvec_t activations; + aubio_tensor_as_fvec(o->dense_output, &activations); + uint_t argmax = fvec_max_elem(&activations); + o->confidence = activations.data[argmax]; + + // skip frames with no activation at all (e.g. silence) + // or with insufficient confidence + if ((argmax == activations.length - 1) + || (o->confidence < o->tolerance)) { + out->data[0] = -100.; + o->confidence = 0; + return; + } + + // perform interpolation across neighbouring outputs + sint_t start = MAX(0, (sint_t)argmax - 4); + uint_t end = MIN(argmax + 5, activations.length); + + smpl_t prod = 0; + smpl_t weight = 0; + smpl_t scaling = 0; + for (i = start; i < end; i++) { + scaling = (smpl_t)(i); + prod += activations.data[i] * scaling; + weight += activations.data[i]; + } + out->data[0] = prod / weight; + + // map output units to midi output + aubio_scale_do(o->scale, out); + + // convert cents to midi + out->data[0] /= 100.; + + // final bias (f_ref = 10Hz -> 3.48 midi) + out->data[0] += 3.486821174621582; +} + +smpl_t aubio_pitch_crepe_get_confidence (aubio_pitch_crepe_t* o) +{ + return o->confidence; +} + +uint_t aubio_pitch_crepe_set_tolerance(aubio_pitch_crepe_t * o, + smpl_t tolerance) +{ + if (o->tolerance < 0 || o->tolerance > 1) return AUBIO_FAIL; + o->tolerance = tolerance; + return AUBIO_OK; +} + +smpl_t aubio_pitch_crepe_get_tolerance (aubio_pitch_crepe_t * o) +{ + return o->tolerance; +} + +uint_t aubio_pitch_crepe_load_params(aubio_pitch_crepe_t *o) +{ + uint_t i; + aubio_tensor_t *k = NULL; + fvec_t *vec = NULL; + + AUBIO_ASSERT(o); + + aubio_file_hdf5_t *hdf5 = new_aubio_file_hdf5(HDF5_FILE_PATH); + if (!hdf5) return AUBIO_FAIL; + + // get kernels + for (i = 0; i < o->n_layers; i++) { + char_t *fmt_key = "/conv%d/conv%d_3/kernel:0"; + char_t key[PATH_MAX]; + snprintf(key, sizeof(key), fmt_key, i+1, i+1); + k = aubio_conv1d_get_kernel(o->conv_layers[i]); + + // push dimension + k->shape[3] = k->shape[2]; k->shape[2] = k->shape[1]; k->shape[1] = 1; + k->ndim += 1; + // load params from hdf5 into kernel tensor + if (aubio_file_hdf5_load_dataset_into_tensor(hdf5, key, k)) + return AUBIO_FAIL; + // pop dimension + k->shape[1] = k->shape[2]; k->shape[2] = k->shape[3]; k->shape[3] = 0; + k->ndim -= 1; + } + + // get bias vectors + for (i = 0; i < o->n_layers; i++) { + char_t *fmt_key = "/conv%d/conv%d_3/bias:0"; + char_t key[PATH_MAX]; + snprintf(key, sizeof(key), fmt_key, i+1, i+1); + vec = aubio_conv1d_get_bias(o->conv_layers[i]); + // load params from hdf5 into kernel tensor + if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec)) + return AUBIO_FAIL; + } + + // batchnorm + for (i = 0; i < o->n_layers; i++) { + char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/gamma:0"; + char_t key[PATH_MAX]; + snprintf(key, sizeof(key), fmt_key, i+1, i+1); + // get kernel matrix + vec = aubio_batchnorm_get_gamma(o->batchnorm_layers[i]); + // load params from hdf5 into kernel tensor + if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec)) + return AUBIO_FAIL; + } + for (i = 0; i < o->n_layers; i++) { + char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/beta:0"; + char_t key[PATH_MAX]; + snprintf(key, sizeof(key), fmt_key, i+1, i+1); + // get kernel matrix + vec = aubio_batchnorm_get_beta(o->batchnorm_layers[i]); + // load params from hdf5 into kernel tensor + if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec)) + return AUBIO_FAIL; + } + for (i = 0; i < o->n_layers; i++) { + char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/moving_mean:0"; + char_t key[PATH_MAX]; + snprintf(key, sizeof(key), fmt_key, i+1, i+1); + // get kernel matrix + vec = aubio_batchnorm_get_moving_mean(o->batchnorm_layers[i]); + // load params from hdf5 into kernel tensor + if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec)) + return AUBIO_FAIL; + } + for (i = 0; i < o->n_layers; i++) { + char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/moving_variance:0"; + char_t key[PATH_MAX]; + snprintf(key, sizeof(key), fmt_key, i+1, i+1); + // get kernel matrix + vec = aubio_batchnorm_get_moving_variance(o->batchnorm_layers[i]); + // load params from hdf5 into kernel tensor + if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec)) + return AUBIO_FAIL; + } + + { + char_t *key = "/classifier/classifier_3/kernel:0"; + fmat_t *d = aubio_dense_get_weights(o->dense_layer); + if (aubio_file_hdf5_load_dataset_into_matrix(hdf5, key, d)) + return AUBIO_FAIL; + + key = "/classifier/classifier_3/bias:0"; + fvec_t *v = aubio_dense_get_bias(o->dense_layer); + if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, v)) + return AUBIO_FAIL; + } + + if (hdf5) { + del_aubio_file_hdf5(hdf5); + } + + return AUBIO_OK; +} -- 2.11.0