From 57630f640ed6f2a6850043cec13f89af74b67741 Mon Sep 17 00:00:00 2001
From: Paul Brossier <piem@piem.org>
Date: Tue, 8 Jan 2019 16:49:22 +0100
Subject: [PATCH] [pitch_crepe] first version

---
 src/pitch/pitch_crepe.c | 501 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 501 insertions(+)
 create mode 100644 src/pitch/pitch_crepe.c

diff --git a/src/pitch/pitch_crepe.c b/src/pitch/pitch_crepe.c
new file mode 100644
index 00000000..adeafe68
--- /dev/null
+++ b/src/pitch/pitch_crepe.c
@@ -0,0 +1,501 @@
+/*
+  Copyright (C) 2018 Paul Brossier <piem@aubio.org>
+
+  This file is part of aubio.
+
+  aubio is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  aubio is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with aubio.  If not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+/* CREPE pitch algorithm
+
+ References
+ ----------
+
+ CREPE: A Convolutional Representation for Pitch Estimation. Jong Wook Kim,
+ Justin Salamon, Peter Li, Juan Pablo Bello.  Proceedings of the IEEE
+ International Conference on Acoustics, Speech, and Signal Processing (ICASSP),
+ 2018. Available online at https://arxiv.org/abs/1802.06182
+
+ Original implementation available at https://github.com/marl/crepe
+
+*/
+
+#include "aubio_priv.h"
+
+#include "fmat.h"
+#include "ai/tensor.h"
+#include "ai/conv1d.h"
+#include "ai/maxpool1d.h"
+#include "ai/batchnorm.h"
+#include "ai/dense.h"
+#include "io/file_hdf5.h"
+#include "utils/scale.h"
+
+#define HDF5_FILE_PATH "crepe-model-tiny.h5" // model opened by load_params
+
+// public prototypes (this patch adds no header file declaring them)
+typedef struct _aubio_pitch_crepe_t aubio_pitch_crepe_t;
+aubio_pitch_crepe_t *new_aubio_pitch_crepe(void); // NULL on failure
+void aubio_pitch_crepe_do(aubio_pitch_crepe_t *t, fvec_t *input, fvec_t *out);
+void del_aubio_pitch_crepe(aubio_pitch_crepe_t *t);
+smpl_t aubio_pitch_crepe_get_confidence (aubio_pitch_crepe_t * o);
+uint_t aubio_pitch_crepe_set_tolerance(aubio_pitch_crepe_t * o, smpl_t
+    tolerance); // AUBIO_OK or AUBIO_FAIL
+smpl_t aubio_pitch_crepe_get_tolerance (aubio_pitch_crepe_t * o);
+
+// static prototypes: reads the layer parameters from HDF5_FILE_PATH
+static uint_t aubio_pitch_crepe_load_params(aubio_pitch_crepe_t *o);
+
+struct _aubio_pitch_crepe_t
+{
+  // number of [conv, batchnorm, maxpool] groups, run in that order by _do
+  uint_t n_layers;
+  // layers: arrays hold n_layers pointers each, followed by one dense layer
+  aubio_conv1d_t **conv_layers;
+  aubio_maxpool1d_t **maxpool_layers;
+  aubio_batchnorm_t **batchnorm_layers;
+  aubio_dense_t *dense_layer;
+  // input/output tensors: one output tensor per layer, created at init
+  aubio_tensor_t *input_tensor;
+  aubio_tensor_t **maxpool_output;
+  aubio_tensor_t **batchnorm_output;
+  aubio_tensor_t **conv_output;
+  aubio_tensor_t *flattened; // copy of the last maxpool output, fed to dense
+  aubio_tensor_t *dense_output;
+  // result of the last frame, rejection threshold and output mapping
+  smpl_t confidence; // largest dense activation of the last frame, or 0
+  smpl_t tolerance; // frames whose confidence is lower are rejected
+  aubio_scale_t *scale; // rescales the interpolated output unit index
+};
+
+/* Create a CREPE pitch detector.
+  Builds n_layers blocks of [conv1d, batchnorm, maxpool1d] followed by one
+  dense layer, allocates every intermediate tensor, then loads the trained
+  parameters from HDF5_FILE_PATH. Returns NULL if any of these steps fails. */
+aubio_pitch_crepe_t *new_aubio_pitch_crepe(void)
+{
+  aubio_pitch_crepe_t *o = AUBIO_NEW(aubio_pitch_crepe_t);
+  aubio_tensor_t *block_input, *last_output;
+  // algorithm constants
+  uint_t input_shape[2] = {1024, 1};
+  uint_t capacity_modes[5] = {4, 8, 16, 24, 32};
+  uint_t n_filters[6] = {32, 4, 4, 4, 8, 16};
+  uint_t widths[6] = {512, 64, 64, 64, 64, 64};
+  uint_t maxpool_stride[1] = {2};
+  uint_t l0_stride[1] = {4};
+  uint_t n_dense = 360;
+  // local variables
+  uint_t capacity_mode = 0;
+  uint_t i, capacity, output_shape[2], dense_input[1];
+  smpl_t start, end;
+
+  if (!o) return NULL; // nothing was allocated, so nothing to release
+  // check the index before it is used to read capacity_modes
+  AUBIO_ASSERT (capacity_mode < 5 && (sint_t)capacity_mode >= 0);
+  capacity = capacity_modes[capacity_mode];
+
+  o->n_layers = 6;
+  // create arrays of layers and tensors
+  o->conv_layers = AUBIO_ARRAY(aubio_conv1d_t*, o->n_layers);
+  o->conv_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers);
+  o->maxpool_layers = AUBIO_ARRAY(aubio_maxpool1d_t*, o->n_layers);
+  o->maxpool_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers);
+  o->batchnorm_layers = AUBIO_ARRAY(aubio_batchnorm_t*, o->n_layers);
+  o->batchnorm_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers);
+
+  if (!o->conv_layers || !o->conv_output
+      || !o->maxpool_layers || !o->maxpool_output
+      || !o->batchnorm_layers || !o->batchnorm_output)
+    goto failure;
+
+  // create layers
+  for (i = 0; i < o->n_layers; i++) {
+    uint_t kern_shape[1] = {widths[i]};
+    // create convolutional layers
+    o->conv_layers[i] = new_aubio_conv1d(n_filters[i] * capacity, kern_shape);
+    if (!o->conv_layers[i]) goto failure;
+    // set padding='same'
+    if (aubio_conv1d_set_padding_mode(o->conv_layers[i], "same") != AUBIO_OK)
+      goto failure;
+    // only the first layer gets an explicit stride
+    if ((i == 0) && (aubio_conv1d_set_stride(o->conv_layers[0],
+            l0_stride) != AUBIO_OK))
+      goto failure;
+
+    // create batchnorm layers
+    o->batchnorm_layers[i] = new_aubio_batchnorm(n_filters[i] * capacity);
+    if (!o->batchnorm_layers[i]) goto failure;
+
+    // create maxpool layers
+    o->maxpool_layers[i] = new_aubio_maxpool1d(maxpool_stride);
+    if (!o->maxpool_layers[i]) goto failure;
+  }
+
+  o->dense_layer = new_aubio_dense(n_dense);
+  if (!o->dense_layer) goto failure;
+
+  // create input/output tensors, each block being fed the previous output
+  o->input_tensor = new_aubio_tensor(2, input_shape);
+  if (!o->input_tensor) goto failure;
+  block_input = o->input_tensor;
+  for (i = 0; i < o->n_layers; i++) {
+    // get shape of conv1d output and create its tensor
+    if (aubio_conv1d_get_output_shape(o->conv_layers[i],
+          block_input, output_shape))
+      goto failure;
+    o->conv_output[i] = new_aubio_tensor(2, output_shape);
+    if (!o->conv_output[i]) goto failure;
+
+    // get shape of batchnorm output and create its tensor
+    if (aubio_batchnorm_get_output_shape(o->batchnorm_layers[i],
+          o->conv_output[i], output_shape))
+      goto failure;
+    o->batchnorm_output[i] = new_aubio_tensor(2, output_shape);
+    if (!o->batchnorm_output[i]) goto failure;
+
+    // get shape of maxpool1d output and create its tensor
+    if (aubio_maxpool1d_get_output_shape(o->maxpool_layers[i],
+          o->batchnorm_output[i], output_shape))
+      goto failure;
+    o->maxpool_output[i] = new_aubio_tensor(2, output_shape);
+    if (!o->maxpool_output[i]) goto failure;
+
+    // set input for next block
+    block_input = o->maxpool_output[i];
+  }
+
+  // the output of the last block (no hard-coded index) feeds the dense layer
+  last_output = o->maxpool_output[o->n_layers - 1];
+  dense_input[0] = last_output->shape[0] * last_output->shape[1];
+  o->flattened = new_aubio_tensor(1, dense_input);
+  if (!o->flattened) goto failure;
+
+  // permute and flatten
+  AUBIO_DBG("permute:           (%d, %d) ->"
+      " (%d, %d) (permutation=(2, 1))\n",
+      last_output->shape[0], last_output->shape[1],
+      last_output->shape[1], last_output->shape[0]);
+  AUBIO_DBG("flatten:           (%d, %d) -> (%d)\n",
+      last_output->shape[1], last_output->shape[0],
+      o->flattened->shape[0]);
+
+  if (aubio_dense_get_output_shape(o->dense_layer, o->flattened, output_shape))
+    goto failure;
+  o->dense_output = new_aubio_tensor(1, output_shape);
+  if (!o->dense_output) goto failure;
+  AUBIO_ASSERT(n_dense == output_shape[0]);
+  if (aubio_pitch_crepe_load_params(o)) goto failure;
+
+  // map output units to midi note
+  start = 1997.379408437619;
+  end = 7180.;
+  o->scale = new_aubio_scale(0., 359., start, start + end);
+  if (!o->scale) goto failure;
+
+  return o;
+
+failure:
+  // del_ skips NULL members: assumes AUBIO_NEW zero-fills -- TODO confirm
+  del_aubio_pitch_crepe(o);
+  return NULL;
+}
+
+/* Destroy a CREPE pitch detector.
+
+  Deletes every layer and tensor that was created, frees the arrays holding
+  them, then frees the object itself. Members found NULL are skipped, which
+  is what lets new_aubio_pitch_crepe call this from its failure path, where
+  the object may be only partially built. */
+void del_aubio_pitch_crepe(aubio_pitch_crepe_t *o)
+{
+  uint_t n;
+  AUBIO_ASSERT(o);
+
+  if (o->input_tensor != NULL) del_aubio_tensor(o->input_tensor);
+
+  // batchnorm output tensors, then the array holding them
+  if (o->batchnorm_output != NULL) {
+    for (n = 0; n < o->n_layers; n++) {
+      aubio_tensor_t *t = o->batchnorm_output[n];
+      if (t != NULL) del_aubio_tensor(t);
+    }
+    AUBIO_FREE(o->batchnorm_output);
+  }
+
+  // batchnorm layers, then the array holding them
+  if (o->batchnorm_layers != NULL) {
+    for (n = 0; n < o->n_layers; n++) {
+      aubio_batchnorm_t *bn = o->batchnorm_layers[n];
+      if (bn != NULL) del_aubio_batchnorm(bn);
+    }
+    AUBIO_FREE(o->batchnorm_layers);
+  }
+
+  // maxpool output tensors, then the array holding them
+  if (o->maxpool_output != NULL) {
+    for (n = 0; n < o->n_layers; n++) {
+      aubio_tensor_t *t = o->maxpool_output[n];
+      if (t != NULL) del_aubio_tensor(t);
+    }
+    AUBIO_FREE(o->maxpool_output);
+  }
+
+  // maxpool layers, then the array holding them
+  if (o->maxpool_layers != NULL) {
+    for (n = 0; n < o->n_layers; n++) {
+      aubio_maxpool1d_t *mp = o->maxpool_layers[n];
+      if (mp != NULL) del_aubio_maxpool1d(mp);
+    }
+    AUBIO_FREE(o->maxpool_layers);
+  }
+
+  // conv output tensors, then the array holding them
+  if (o->conv_output != NULL) {
+    for (n = 0; n < o->n_layers; n++) {
+      aubio_tensor_t *t = o->conv_output[n];
+      if (t != NULL) del_aubio_tensor(t);
+    }
+    AUBIO_FREE(o->conv_output);
+  }
+
+  // conv layers, then the array holding them
+  if (o->conv_layers != NULL) {
+    for (n = 0; n < o->n_layers; n++) {
+      aubio_conv1d_t *cv = o->conv_layers[n];
+      if (cv != NULL) del_aubio_conv1d(cv);
+    }
+    AUBIO_FREE(o->conv_layers);
+  }
+
+  // members that are not arrays: flattened tensor, dense layer, output, scale
+  if (o->flattened != NULL) del_aubio_tensor(o->flattened);
+  if (o->dense_layer != NULL) del_aubio_dense(o->dense_layer);
+  if (o->dense_output != NULL) del_aubio_tensor(o->dense_output);
+  if (o->scale != NULL) del_aubio_scale(o->scale);
+
+  AUBIO_FREE(o);
+}
+
+/* Estimate the pitch of one frame of audio.
+
+  input: frame of length input_tensor->shape[0]; it is modified in place,
+    its mean being subtracted
+  out: out->data[0] receives the pitch in midi, or -100. if the frame is
+    rejected, in which case the stored confidence is reset to 0 */
+void aubio_pitch_crepe_do(aubio_pitch_crepe_t *o, fvec_t *input, fvec_t *out)
+{
+  uint_t i;
+  AUBIO_ASSERT(o && input && out);
+  AUBIO_ASSERT(input->length == o->input_tensor->shape[0]);
+  // normalize frame, removing mean and dividing by std
+  smpl_t mean = fvec_mean(input);
+  fvec_add(input, -mean);
+  smpl_t std = 0.;
+  for (i = 0; i < input->length; i++) {
+    std += SQR(input->data[i]);
+  }
+  std = SQRT(std / (smpl_t)input->length);
+  // (near) constant frame: leave it unscaled rather than dividing by ~0
+  if (std < 1.e-7) std = 1;
+
+  // copy the normalized frame to the input tensor
+  for (i = 0; i < input->length; i++) {
+    o->input_tensor->data[0][i] = input->data[i] / std;
+  }
+
+  // run the [conv1d, batchnorm, maxpool1d] blocks, each fed by the previous
+  aubio_tensor_t *block_input = o->input_tensor;
+  for (i = 0; i < o->n_layers; i++) {
+    aubio_conv1d_do(o->conv_layers[i], block_input,
+        o->conv_output[i]);
+    aubio_batchnorm_do(o->batchnorm_layers[i], o->conv_output[i],
+        o->batchnorm_output[i]);
+    aubio_maxpool1d_do(o->maxpool_layers[i], o->batchnorm_output[i],
+        o->maxpool_output[i]);
+    block_input = o->maxpool_output[i];
+  }
+
+  // output of the last block, whatever the number of layers
+  aubio_tensor_t *permute_input = o->maxpool_output[o->n_layers - 1];
+  // perform flattening (permutation has no effect here, order unchanged)
+  AUBIO_ASSERT (permute_input->size == o->flattened->size);
+  for (i = 0; i < permute_input->size; i++) {
+    o->flattened->data[0][i] = permute_input->data[0][i];
+  }
+
+  // compute dense layer
+  aubio_dense_do(o->dense_layer, o->flattened, o->dense_output);
+
+  // find maximum activation
+  fvec_t activations;
+  aubio_tensor_as_fvec(o->dense_output, &activations);
+  uint_t argmax = fvec_max_elem(&activations);
+  o->confidence = activations.data[argmax];
+
+  // skip frames with no activation at all (e.g. silence)
+  // or with insufficient confidence
+  if ((argmax == activations.length - 1)
+      || (o->confidence < o->tolerance)) {
+    out->data[0] = -100.;
+    o->confidence = 0;
+    return;
+  }
+
+  // perform interpolation across neighbouring outputs
+  sint_t start = MAX(0, (sint_t)argmax - 4);
+  uint_t end = MIN(argmax + 5, activations.length);
+  smpl_t prod = 0;
+  smpl_t weight = 0;
+  for (i = start; i < end; i++) {
+    prod += activations.data[i] * (smpl_t)i;
+    weight += activations.data[i];
+  }
+  // a window summing to zero would make prod / weight NaN: treat the frame
+  // as rejected, with the same result as above
+  if (weight == 0) {
+    out->data[0] = -100.;
+    o->confidence = 0;
+    return;
+  }
+  out->data[0] = prod / weight;
+
+  // map output units to midi output
+  aubio_scale_do(o->scale, out);
+
+  // convert cents to midi
+  out->data[0] /= 100.;
+
+  // final bias (f_ref = 10Hz -> 3.48 midi)
+  out->data[0] += 3.486821174621582;
+}
+
+smpl_t aubio_pitch_crepe_get_confidence (aubio_pitch_crepe_t* o)
+{ /* value last stored by aubio_pitch_crepe_do, 0 for a rejected frame */
+  return (*o).confidence;
+}
+
+uint_t aubio_pitch_crepe_set_tolerance(aubio_pitch_crepe_t * o,
+    smpl_t tolerance)
+{ /* store tolerance and return AUBIO_OK, or AUBIO_FAIL if outside [0, 1] */
+  if (tolerance < 0 || tolerance > 1) return AUBIO_FAIL; // check the argument
+  o->tolerance = tolerance;
+  return AUBIO_OK;
+}
+
+smpl_t aubio_pitch_crepe_get_tolerance (aubio_pitch_crepe_t * o)
+{ /* threshold that aubio_pitch_crepe_do compares the confidence against */
+  return (*o).tolerance;
+}
+
+/* Load the trained parameters of every layer from HDF5_FILE_PATH.
+
+  For each of the n_layers blocks, reads the conv1d kernel and bias and the
+  batchnorm gamma, beta, moving_mean and moving_variance; then reads the
+  kernel and bias of the dense classifier. Returns AUBIO_OK on success, or
+  AUBIO_FAIL if the file cannot be opened or a dataset fails to load. The
+  file object is deleted on every path once it has been opened. */
+uint_t aubio_pitch_crepe_load_params(aubio_pitch_crepe_t *o)
+{
+  uint_t i;
+  uint_t ret = AUBIO_FAIL;
+  aubio_tensor_t *k = NULL;
+  fvec_t *vec = NULL;
+
+  AUBIO_ASSERT(o);
+
+  aubio_file_hdf5_t *hdf5 = new_aubio_file_hdf5(HDF5_FILE_PATH);
+  if (!hdf5) return AUBIO_FAIL;
+
+  // get kernels
+  for (i = 0; i < o->n_layers; i++) {
+    char_t *fmt_key = "/conv%d/conv%d_3/kernel:0";
+    char_t key[PATH_MAX];
+    uint_t err;
+    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
+    k = aubio_conv1d_get_kernel(o->conv_layers[i]);
+
+    // push dimension: insert an axis of size 1 at index 1 during the load
+    k->shape[3] = k->shape[2]; k->shape[2] = k->shape[1]; k->shape[1] = 1;
+    k->ndim += 1;
+    // load params from hdf5 into kernel tensor
+    err = aubio_file_hdf5_load_dataset_into_tensor(hdf5, key, k);
+    // pop dimension, also when the load failed, so the kernel keeps its shape
+    k->shape[1] = k->shape[2]; k->shape[2] = k->shape[3]; k->shape[3] = 0;
+    k->ndim -= 1;
+    if (err) goto cleanup;
+  }
+
+  // get bias vectors
+  for (i = 0; i < o->n_layers; i++) {
+    char_t *fmt_key = "/conv%d/conv%d_3/bias:0";
+    char_t key[PATH_MAX];
+    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
+    vec = aubio_conv1d_get_bias(o->conv_layers[i]);
+    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec)) goto cleanup;
+  }
+
+  // batchnorm: gamma
+  for (i = 0; i < o->n_layers; i++) {
+    char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/gamma:0";
+    char_t key[PATH_MAX];
+    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
+    vec = aubio_batchnorm_get_gamma(o->batchnorm_layers[i]);
+    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec)) goto cleanup;
+  }
+
+  // batchnorm: beta
+  for (i = 0; i < o->n_layers; i++) {
+    char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/beta:0";
+    char_t key[PATH_MAX];
+    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
+    vec = aubio_batchnorm_get_beta(o->batchnorm_layers[i]);
+    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec)) goto cleanup;
+  }
+
+  // batchnorm: moving mean
+  for (i = 0; i < o->n_layers; i++) {
+    char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/moving_mean:0";
+    char_t key[PATH_MAX];
+    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
+    vec = aubio_batchnorm_get_moving_mean(o->batchnorm_layers[i]);
+    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec)) goto cleanup;
+  }
+
+  // batchnorm: moving variance
+  for (i = 0; i < o->n_layers; i++) {
+    char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/moving_variance:0";
+    char_t key[PATH_MAX];
+    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
+    vec = aubio_batchnorm_get_moving_variance(o->batchnorm_layers[i]);
+    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec)) goto cleanup;
+  }
+
+  // dense classifier: weights matrix first, then bias vector
+  {
+    char_t *key = "/classifier/classifier_3/kernel:0";
+    fmat_t *d = aubio_dense_get_weights(o->dense_layer);
+    if (aubio_file_hdf5_load_dataset_into_matrix(hdf5, key, d)) goto cleanup;
+
+    key = "/classifier/classifier_3/bias:0";
+    fvec_t *v = aubio_dense_get_bias(o->dense_layer);
+    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, v)) goto cleanup;
+  }
+
+  ret = AUBIO_OK;
+
+cleanup:
+  del_aubio_file_hdf5(hdf5);
+  return ret;
+}
-- 
2.11.0