--- /dev/null
+/*
+ Copyright (C) 2018 Paul Brossier <piem@aubio.org>
+
+ This file is part of aubio.
+
+ aubio is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ aubio is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with aubio. If not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+/* CREPE pitch algorithm
+
+ References
+ ----------
+
+ CREPE: A Convolutional Representation for Pitch Estimation. Jong Wook Kim,
+ Justin Salamon, Peter Li, Juan Pablo Bello. Proceedings of the IEEE
+ International Conference on Acoustics, Speech, and Signal Processing (ICASSP),
+ 2018. Available online at https://arxiv.org/abs/1802.06182
+
+ Original implementation available at https://github.com/marl/crepe
+
+*/
+
+#include "aubio_priv.h"
+
+#include "fmat.h"
+#include "ai/tensor.h"
+#include "ai/conv1d.h"
+#include "ai/maxpool1d.h"
+#include "ai/batchnorm.h"
+#include "ai/dense.h"
+#include "io/file_hdf5.h"
+#include "utils/scale.h"
+
+// Name of the HDF5 file holding the trained model parameters; it is passed
+// unchanged to new_aubio_file_hdf5() by aubio_pitch_crepe_load_params().
+#define HDF5_FILE_PATH "crepe-model-tiny.h5"
+
+// public prototypes
+typedef struct _aubio_pitch_crepe_t aubio_pitch_crepe_t;
+// create a new detector; returns NULL if any part of the setup fails
+aubio_pitch_crepe_t *new_aubio_pitch_crepe(void);
+// process one frame; the result is written to out->data[0]
+void aubio_pitch_crepe_do(aubio_pitch_crepe_t *t, fvec_t *input, fvec_t *out);
+// release a detector and everything it owns
+void del_aubio_pitch_crepe(aubio_pitch_crepe_t *t);
+// confidence stored by the last call to aubio_pitch_crepe_do()
+smpl_t aubio_pitch_crepe_get_confidence (aubio_pitch_crepe_t * o);
+// set the confidence threshold used by aubio_pitch_crepe_do();
+// returns AUBIO_OK or AUBIO_FAIL
+uint_t aubio_pitch_crepe_set_tolerance(aubio_pitch_crepe_t * o, smpl_t
+ tolerance);
+// current confidence threshold
+smpl_t aubio_pitch_crepe_get_tolerance (aubio_pitch_crepe_t * o);
+
+// static prototypes
+// load the model parameters from HDF5_FILE_PATH into the layers of o
+static uint_t aubio_pitch_crepe_load_params(aubio_pitch_crepe_t *o);
+
+// State of a CREPE pitch detector: the network layers, the tensors holding
+// each layer's output, and the results of the last processed frame.
+struct _aubio_pitch_crepe_t
+{
+ // number of blocks; each block is run as conv1d -> batchnorm -> maxpool1d
+ uint_t n_layers;
+ // layers, one entry per block (arrays of n_layers pointers)
+ aubio_conv1d_t **conv_layers;
+ aubio_maxpool1d_t **maxpool_layers;
+ aubio_batchnorm_t **batchnorm_layers;
+ // final dense layer, applied to the flattened output of the last block
+ aubio_dense_t *dense_layer;
+ // input/output tensors
+ // normalized copy of the frame being processed
+ aubio_tensor_t *input_tensor;
+ // per-block layer outputs (arrays of n_layers pointers)
+ aubio_tensor_t **maxpool_output;
+ aubio_tensor_t **batchnorm_output;
+ aubio_tensor_t **conv_output;
+ // 1-dimensional copy of the last maxpool output, fed to the dense layer
+ aubio_tensor_t *flattened;
+ // output of the dense layer
+ aubio_tensor_t *dense_output;
+
+ // highest activation of the last processed frame, or 0 if it was rejected
+ smpl_t confidence;
+ // frames whose confidence is below this value are rejected
+ smpl_t tolerance;
+ // rescales the interpolated output index in aubio_pitch_crepe_do()
+ aubio_scale_t *scale;
+};
+
+/* Create a CREPE pitch detector.
+ *
+ * Builds the conv1d/batchnorm/maxpool1d blocks and the final dense layer,
+ * allocates the tensors that connect them, loads the trained parameters with
+ * aubio_pitch_crepe_load_params() and creates the scale applied to the
+ * interpolated output index.
+ *
+ * Returns the new object, or NULL if an allocation, a layer setup step or the
+ * parameter load fails; whatever was created up to that point is released.
+ */
+aubio_pitch_crepe_t *new_aubio_pitch_crepe(void)
+{
+  aubio_pitch_crepe_t *o = AUBIO_NEW(aubio_pitch_crepe_t);
+  aubio_tensor_t *block_input;
+  aubio_tensor_t *last_output;
+  // algorithm constants
+  uint_t input_shape[2] = {1024, 1};
+  uint_t capacity_modes[5] = {4, 8, 16, 24, 32};
+  uint_t n_filters[6] = {32, 4, 4, 4, 8, 16};
+  uint_t widths[6] = {512, 64, 64, 64, 64, 64};
+  uint_t maxpool_stride[1] = {2};
+  uint_t l0_stride[1] = {4};
+  uint_t n_dense = 360;
+  // bounds handed to new_aubio_scale() together with the index range [0, 359]
+  smpl_t scale_start = 1997.379408437619;
+  smpl_t scale_span = 7180.;
+
+  // local variables
+  uint_t capacity_mode = 0;
+  // multiplier applied to n_filters[] for every block
+  uint_t capacity = capacity_modes[capacity_mode];
+  uint_t output_shape[2];
+  uint_t flattened_dim;
+  uint_t dense_input[1];
+  uint_t i;
+
+  // nothing to release yet, and the failure path below dereferences o
+  if (!o) return NULL;
+
+  AUBIO_ASSERT (capacity_mode < 5 && (sint_t)capacity_mode >= 0);
+
+  o->n_layers = 6;
+  // create arrays of layers and tensors
+  o->conv_layers = AUBIO_ARRAY(aubio_conv1d_t*, o->n_layers);
+  o->conv_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers);
+  o->maxpool_layers = AUBIO_ARRAY(aubio_maxpool1d_t*, o->n_layers);
+  o->maxpool_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers);
+  o->batchnorm_layers = AUBIO_ARRAY(aubio_batchnorm_t*, o->n_layers);
+  o->batchnorm_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers);
+
+  if (!o->conv_layers || !o->conv_output
+      || !o->maxpool_layers || !o->maxpool_output
+      || !o->batchnorm_layers || !o->batchnorm_output)
+    goto failure;
+
+  // create layers
+  for (i = 0; i < o->n_layers; i++) {
+    uint_t kern_shape[1] = {widths[i]};
+    // create convolutional layers
+    o->conv_layers[i] = new_aubio_conv1d(n_filters[i] * capacity, kern_shape);
+    if (!o->conv_layers[i]) goto failure;
+    // set padding='same'
+    if (aubio_conv1d_set_padding_mode(o->conv_layers[i], "same") != AUBIO_OK) {
+      goto failure;
+    }
+    // only the first layer uses a non-default stride
+    if ((i == 0) && (aubio_conv1d_set_stride(o->conv_layers[0],
+            l0_stride) != AUBIO_OK) ) {
+      goto failure;
+    }
+
+    // create batchnorm layers
+    o->batchnorm_layers[i] = new_aubio_batchnorm(n_filters[i] * capacity);
+    if (!o->batchnorm_layers[i]) goto failure;
+
+    // create maxpool layers
+    o->maxpool_layers[i] = new_aubio_maxpool1d(maxpool_stride);
+    if (!o->maxpool_layers[i]) goto failure;
+  }
+
+  o->dense_layer = new_aubio_dense(n_dense);
+  if (!o->dense_layer) goto failure;
+
+  // create input/output tensors, chaining each block's output shape into the
+  // next block's input
+  o->input_tensor = new_aubio_tensor(2, input_shape);
+  if (!o->input_tensor) goto failure;
+  block_input = o->input_tensor;
+  for (i = 0; i < o->n_layers; i++) {
+    // get shape of conv1d output and create its tensor
+    if (aubio_conv1d_get_output_shape(o->conv_layers[i],
+          block_input, output_shape))
+      goto failure;
+    o->conv_output[i] = new_aubio_tensor(2, output_shape);
+    if (!o->conv_output[i]) goto failure;
+
+    // get shape of batchnorm output and create its tensor
+    if (aubio_batchnorm_get_output_shape(o->batchnorm_layers[i],
+          o->conv_output[i], output_shape))
+      goto failure;
+    o->batchnorm_output[i] = new_aubio_tensor(2, output_shape);
+    if (!o->batchnorm_output[i]) goto failure;
+
+    // get shape of maxpool1d output and create its tensor
+    if (aubio_maxpool1d_get_output_shape(o->maxpool_layers[i],
+          o->batchnorm_output[i], output_shape))
+      goto failure;
+    o->maxpool_output[i] = new_aubio_tensor(2, output_shape);
+    if (!o->maxpool_output[i]) goto failure;
+
+    // set input for next block
+    block_input = o->maxpool_output[i];
+  }
+
+  // the dense layer reads the output of the last block as a flat vector
+  last_output = o->maxpool_output[o->n_layers - 1];
+  flattened_dim = last_output->shape[0];
+  flattened_dim *= last_output->shape[1];
+  dense_input[0] = flattened_dim;
+  o->flattened = new_aubio_tensor(1, dense_input);
+  if (!o->flattened) goto failure;
+
+  // permute and flatten
+  AUBIO_DBG("permute: (%d, %d) ->"
+      " (%d, %d) (permutation=(2, 1))\n",
+      last_output->shape[0], last_output->shape[1],
+      last_output->shape[1], last_output->shape[0]);
+  AUBIO_DBG("flatten: (%d, %d) -> (%d)\n",
+      last_output->shape[1], last_output->shape[0],
+      o->flattened->shape[0]);
+
+  if (aubio_dense_get_output_shape(o->dense_layer, o->flattened, output_shape))
+    goto failure;
+  o->dense_output = new_aubio_tensor(1, output_shape);
+  if (!o->dense_output) goto failure;
+
+  AUBIO_ASSERT(n_dense == output_shape[0]);
+
+  if (aubio_pitch_crepe_load_params(o))
+    goto failure;
+
+  // map output units to midi note
+  o->scale = new_aubio_scale(0., 359., scale_start, scale_start + scale_span);
+  if (!o->scale) goto failure;
+
+  return o;
+
+failure:
+  // NOTE(review): relies on members that were never set being NULL, i.e. on
+  // AUBIO_NEW / AUBIO_ARRAY returning zeroed memory -- confirm
+  del_aubio_pitch_crepe(o);
+  return NULL;
+}
+
+/* Delete the first `count` tensors of `tensors` (skipping NULL entries), then
+ * free the array itself. Does nothing when `tensors` is NULL. */
+static void aubio_pitch_crepe_del_tensors(aubio_tensor_t **tensors,
+    uint_t count)
+{
+  uint_t n;
+  if (!tensors) return;
+  for (n = 0; n < count; n++) {
+    if (tensors[n]) del_aubio_tensor(tensors[n]);
+  }
+  AUBIO_FREE(tensors);
+}
+
+/* Release a detector: every tensor and layer it holds, the arrays that hold
+ * them, the scale, and finally the object itself. Members that are NULL are
+ * skipped, so a partially constructed object can be passed in.
+ */
+void del_aubio_pitch_crepe(aubio_pitch_crepe_t *o)
+{
+  uint_t layer;
+  AUBIO_ASSERT(o);
+
+  if (o->input_tensor)
+    del_aubio_tensor(o->input_tensor);
+
+  // batchnorm: outputs, then layers
+  aubio_pitch_crepe_del_tensors(o->batchnorm_output, o->n_layers);
+  if (o->batchnorm_layers) {
+    for (layer = 0; layer < o->n_layers; layer++) {
+      aubio_batchnorm_t *bn = o->batchnorm_layers[layer];
+      if (bn) del_aubio_batchnorm(bn);
+    }
+    AUBIO_FREE(o->batchnorm_layers);
+  }
+
+  // maxpool: outputs, then layers
+  aubio_pitch_crepe_del_tensors(o->maxpool_output, o->n_layers);
+  if (o->maxpool_layers) {
+    for (layer = 0; layer < o->n_layers; layer++) {
+      aubio_maxpool1d_t *mp = o->maxpool_layers[layer];
+      if (mp) del_aubio_maxpool1d(mp);
+    }
+    AUBIO_FREE(o->maxpool_layers);
+  }
+
+  // conv1d: outputs, then layers
+  aubio_pitch_crepe_del_tensors(o->conv_output, o->n_layers);
+  if (o->conv_layers) {
+    for (layer = 0; layer < o->n_layers; layer++) {
+      aubio_conv1d_t *conv = o->conv_layers[layer];
+      if (conv) del_aubio_conv1d(conv);
+    }
+    AUBIO_FREE(o->conv_layers);
+  }
+
+  // dense stage and scale
+  if (o->flattened)
+    del_aubio_tensor(o->flattened);
+  if (o->dense_layer)
+    del_aubio_dense(o->dense_layer);
+  if (o->dense_output)
+    del_aubio_tensor(o->dense_output);
+  if (o->scale)
+    del_aubio_scale(o->scale);
+
+  AUBIO_FREE(o);
+}
+
+/* Estimate the pitch of one frame.
+ *
+ * o      detector created with new_aubio_pitch_crepe()
+ * input  frame to analyse; its length must equal the first dimension of the
+ *        input tensor. NOTE: the frame is modified in place (its mean is
+ *        subtracted from every sample).
+ * out    out->data[0] receives the estimate, or -100. when the frame is
+ *        rejected
+ *
+ * o->confidence is set to the highest output activation, or to 0 when the
+ * frame is rejected.
+ */
+void aubio_pitch_crepe_do(aubio_pitch_crepe_t *o, fvec_t *input, fvec_t *out)
+{
+  uint_t i;
+  AUBIO_ASSERT(o && input && out);
+  AUBIO_ASSERT(input->length == o->input_tensor->shape[0]);
+
+  // normalize frame, removing mean and dividing by std
+  smpl_t mean = fvec_mean(input);
+  fvec_add(input, -mean);
+  smpl_t std = 0.;
+  for (i = 0; i < input->length; i++) {
+    std += SQR(input->data[i]);
+  }
+  std = SQRT(std / (smpl_t)input->length);
+  // (near-)constant frame: skip the division instead of blowing it up
+  if (std < 1.e-7) std = 1;
+
+  // copy the normalized frame to the input tensor
+  for (i = 0; i < input->length; i++) {
+    o->input_tensor->data[0][i] = input->data[i] / std;
+  }
+
+  // run the blocks, each one reading the previous block's maxpool output
+  aubio_tensor_t *block_input = o->input_tensor;
+  for (i = 0; i < o->n_layers; i++) {
+    aubio_conv1d_do(o->conv_layers[i], block_input,
+        o->conv_output[i]);
+    aubio_batchnorm_do(o->batchnorm_layers[i], o->conv_output[i],
+        o->batchnorm_output[i]);
+    aubio_maxpool1d_do(o->maxpool_layers[i], o->batchnorm_output[i],
+        o->maxpool_output[i]);
+    block_input = o->maxpool_output[i];
+  }
+
+  // output of the last block, whatever the number of blocks
+  aubio_tensor_t *permute_input = o->maxpool_output[o->n_layers - 1];
+  // perform flattening (permutation has no effect here, order unchanged)
+  AUBIO_ASSERT (permute_input->size == o->flattened->size);
+  for (i = 0; i < permute_input->size; i++) {
+    o->flattened->data[0][i] = permute_input->data[0][i];
+  }
+
+  // compute dense layer
+  aubio_dense_do(o->dense_layer, o->flattened, o->dense_output);
+
+#if 0
+  // print debug output
+  for (i = 0; i < o->n_layers; i++) {
+    AUBIO_DBG("pitch_crepe: conv1d[%d]    %f\n", i,
+        aubio_tensor_max(o->conv_output[i]));
+    AUBIO_DBG("pitch_crepe: batchnorm[%d] %f\n", i,
+        aubio_tensor_max(o->batchnorm_output[i]));
+    AUBIO_DBG("pitch_crepe: maxpool1d[%d] %f\n", i,
+        aubio_tensor_max(o->maxpool_output[i]));
+  }
+  AUBIO_DBG("pitch_crepe: dense %f\n", aubio_tensor_max(o->dense_output));
+#endif
+
+  // find maximum activation
+  fvec_t activations;
+  aubio_tensor_as_fvec(o->dense_output, &activations);
+  uint_t argmax = fvec_max_elem(&activations);
+  o->confidence = activations.data[argmax];
+
+  // skip frames with no activation at all (e.g. silence)
+  // or with insufficient confidence
+  if ((argmax == activations.length - 1)
+      || (o->confidence < o->tolerance)) {
+    out->data[0] = -100.;
+    o->confidence = 0;
+    return;
+  }
+
+  // perform interpolation across neighbouring outputs: activation-weighted
+  // mean of the indices in [argmax - 4, argmax + 4], clipped to valid indices
+  uint_t start = (argmax > 4) ? argmax - 4 : 0;
+  uint_t end = MIN(argmax + 5, activations.length);
+
+  smpl_t prod = 0;
+  smpl_t weight = 0;
+  for (i = start; i < end; i++) {
+    prod += activations.data[i] * (smpl_t)(i);
+    weight += activations.data[i];
+  }
+
+  // guard the division below: should every activation in the window be 0,
+  // report the frame as rejected rather than writing NaN
+  if (weight == 0) {
+    out->data[0] = -100.;
+    o->confidence = 0;
+    return;
+  }
+  out->data[0] = prod / weight;
+
+  // map output units to midi output
+  aubio_scale_do(o->scale, out);
+
+  // convert cents to midi
+  out->data[0] /= 100.;
+
+  // final bias (f_ref = 10Hz -> 3.48 midi)
+  out->data[0] += 3.486821174621582;
+}
+
+// Return the confidence stored by the last call to aubio_pitch_crepe_do():
+// the highest output activation, or 0 when that frame was rejected.
+smpl_t aubio_pitch_crepe_get_confidence (aubio_pitch_crepe_t* o)
+{
+ return o->confidence;
+}
+
+/* Set the confidence threshold below which aubio_pitch_crepe_do() rejects a
+ * frame.
+ *
+ * tolerance  new threshold, expected in [0, 1]
+ *
+ * Returns AUBIO_OK when the value was stored, AUBIO_FAIL (leaving the current
+ * threshold unchanged) when it is outside [0, 1] or not a number.
+ */
+uint_t aubio_pitch_crepe_set_tolerance(aubio_pitch_crepe_t * o,
+    smpl_t tolerance)
+{
+  // validate the requested value, not the one currently stored; written as a
+  // negated range test so that NaN is rejected as well
+  if (!(tolerance >= 0 && tolerance <= 1)) return AUBIO_FAIL;
+  o->tolerance = tolerance;
+  return AUBIO_OK;
+}
+
+// Return the confidence threshold currently used by aubio_pitch_crepe_do().
+smpl_t aubio_pitch_crepe_get_tolerance (aubio_pitch_crepe_t * o)
+{
+ return o->tolerance;
+}
+
+/* Load the trained network parameters from HDF5_FILE_PATH into the layers.
+ *
+ * Fills the kernel and bias of every conv1d layer, the gamma, beta, moving
+ * mean and moving variance of every batchnorm layer, and the weights and bias
+ * of the dense layer.
+ *
+ * Returns AUBIO_OK on success, AUBIO_FAIL if the file cannot be opened or a
+ * dataset fails to load. The file object is released on every path.
+ */
+static uint_t aubio_pitch_crepe_load_params(aubio_pitch_crepe_t *o)
+{
+  uint_t i;
+  uint_t ret = AUBIO_FAIL;
+  aubio_tensor_t *k = NULL;
+  fvec_t *vec = NULL;
+  fmat_t *weights = NULL;
+  aubio_file_hdf5_t *hdf5 = NULL;
+  // dataset name, rebuilt for each layer (layers are numbered from 1)
+  char_t key[PATH_MAX];
+
+  AUBIO_ASSERT(o);
+
+  hdf5 = new_aubio_file_hdf5(HDF5_FILE_PATH);
+  if (!hdf5) return AUBIO_FAIL;
+
+  // get kernels
+  for (i = 0; i < o->n_layers; i++) {
+    uint_t load_failed;
+    snprintf(key, sizeof(key), "/conv%d/conv%d_3/kernel:0",
+        (int)(i + 1), (int)(i + 1));
+    k = aubio_conv1d_get_kernel(o->conv_layers[i]);
+    if (!k) goto cleanup;
+
+    // push dimension: present the kernel with an extra axis of length 1 in
+    // second position while it is being loaded
+    k->shape[3] = k->shape[2]; k->shape[2] = k->shape[1]; k->shape[1] = 1;
+    k->ndim += 1;
+    // load params from hdf5 into kernel tensor
+    load_failed = aubio_file_hdf5_load_dataset_into_tensor(hdf5, key, k)
+      ? 1 : 0;
+    // pop dimension, also after a failed load so that the kernel never keeps
+    // the temporary shape
+    k->shape[1] = k->shape[2]; k->shape[2] = k->shape[3]; k->shape[3] = 0;
+    k->ndim -= 1;
+    if (load_failed) goto cleanup;
+  }
+
+  // get bias vectors
+  for (i = 0; i < o->n_layers; i++) {
+    snprintf(key, sizeof(key), "/conv%d/conv%d_3/bias:0",
+        (int)(i + 1), (int)(i + 1));
+    vec = aubio_conv1d_get_bias(o->conv_layers[i]);
+    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
+      goto cleanup;
+  }
+
+  // batchnorm: gamma
+  for (i = 0; i < o->n_layers; i++) {
+    snprintf(key, sizeof(key), "/conv%d-BN/conv%d-BN_3/gamma:0",
+        (int)(i + 1), (int)(i + 1));
+    vec = aubio_batchnorm_get_gamma(o->batchnorm_layers[i]);
+    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
+      goto cleanup;
+  }
+  // batchnorm: beta
+  for (i = 0; i < o->n_layers; i++) {
+    snprintf(key, sizeof(key), "/conv%d-BN/conv%d-BN_3/beta:0",
+        (int)(i + 1), (int)(i + 1));
+    vec = aubio_batchnorm_get_beta(o->batchnorm_layers[i]);
+    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
+      goto cleanup;
+  }
+  // batchnorm: moving mean
+  for (i = 0; i < o->n_layers; i++) {
+    snprintf(key, sizeof(key), "/conv%d-BN/conv%d-BN_3/moving_mean:0",
+        (int)(i + 1), (int)(i + 1));
+    vec = aubio_batchnorm_get_moving_mean(o->batchnorm_layers[i]);
+    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
+      goto cleanup;
+  }
+  // batchnorm: moving variance
+  for (i = 0; i < o->n_layers; i++) {
+    snprintf(key, sizeof(key), "/conv%d-BN/conv%d-BN_3/moving_variance:0",
+        (int)(i + 1), (int)(i + 1));
+    vec = aubio_batchnorm_get_moving_variance(o->batchnorm_layers[i]);
+    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
+      goto cleanup;
+  }
+
+  // dense layer: weights, then bias
+  weights = aubio_dense_get_weights(o->dense_layer);
+  if (aubio_file_hdf5_load_dataset_into_matrix(hdf5,
+        "/classifier/classifier_3/kernel:0", weights))
+    goto cleanup;
+
+  vec = aubio_dense_get_bias(o->dense_layer);
+  if (aubio_file_hdf5_load_dataset_into_vector(hdf5,
+        "/classifier/classifier_3/bias:0", vec))
+    goto cleanup;
+
+  ret = AUBIO_OK;
+
+cleanup:
+  // single exit point so the file object is released on failure too
+  del_aubio_file_hdf5(hdf5);
+  return ret;
+}