DeepSpeech/native_client/deepspeech.cc at master · ashgorithm/DeepSpeech

319 lines (271 loc) · 9.75 KB
#ifdef DS_NATIVE_MODEL
#define EIGEN_USE_THREADS
#define EIGEN_USE_CUSTOM_THREAD_POOL
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "native_client/deepspeech_model.h" // generated
#include <iostream>
#include "deepspeech.h"
#include "deepspeech_utils.h"
#include "alphabet.h"
#include "beam_search.h"
#include "tensorflow/core/public/session.h"
#include "tensorflow/core/platform/env.h"
#define BATCH_SIZE 1
using namespace tensorflow;
using tensorflow::ctc::CTCBeamSearchDecoder;
using tensorflow::ctc::CTCDecoder;
namespace DeepSpeech {
// Internal state behind a Model. Model's member functions read and write
// these fields directly through mPriv, so they are public.
class Private {
  public:
    // TensorFlow session created in Model::Model(); stays NULL when the
    // built-in model is used or when loading the graph failed.
    Session* session = NULL;
    // Graph read from the model file given to the constructor.
    GraphDef graph_def;
    // Feature-extraction settings: forwarded to audioToInputVector() and
    // used by infer() to compute the per-frame input size.
    int ncep = 0;
    int ncontext = 0;
    // Label <-> string mapping built from the alphabet config path.
    Alphabet* alphabet = NULL;
    // Optional language-model scorer; NULL makes decode() use the default
    // beam scorer.
    KenLMBeamScorer* scorer = NULL;
    // Beam width handed to the CTC beam search decoder.
    int beam_width = 0;
    // True when no model path was supplied and infer() must rely on the
    // built-in (DS_NATIVE_MODEL) model.
    bool run_aot = false;
};
Model::Model(const char* aModelPath, int aNCep, int aNContext,
             const char* aAlphabetConfigPath, int aBeamWidth)
  mPriv             = new Private;
  mPriv->session    = NULL;
  mPriv->scorer     = NULL;
  mPriv->ncep       = aNCep;
  mPriv->ncontext   = aNContext;
  mPriv->alphabet   = new Alphabet(aAlphabetConfigPath);
  mPriv->beam_width = aBeamWidth;
  mPriv->run_aot    = false;
  if (!aModelPath || strlen(aModelPath) < 1) {
    std::cerr << "No model specified, will rely on built-in model." << std::endl;
    mPriv->run_aot = true;
    return;
  Status status = NewSession(SessionOptions(), &mPriv->session);
  if (!status.ok()) {
    std::cerr << status.ToString() << std::endl;
    return;
  status = ReadBinaryProto(Env::Default(), aModelPath, &mPriv->graph_def);
  if (!status.ok()) {
    mPriv->session->Close();
    mPriv->session = NULL;
    std::cerr << status.ToString() << std::endl;
    return;
  status = mPriv->session->Create(mPriv->graph_def);
  if (!status.ok()) {
    mPriv->session->Close();
    mPriv->session = NULL;
    std::cerr << status.ToString() << std::endl;
    return;
  for (int i = 0; i < mPriv->graph_def.node_size(); ++i) {
    NodeDef node = mPriv->graph_def.node(i);
    if (node.name() == "logits/shape/2") {
      int final_dim_size = node.attr().at("value").tensor().int_val(0) - 1;
      if (final_dim_size != mPriv->alphabet->GetSize()) {
        std::cerr << "Error: Alphabet size does not match loaded model: alphabet "
                  << "has size " << mPriv->alphabet->GetSize()
                  << ", but model has " << final_dim_size
                  << " classes in its output. Make sure you're passing an alphabet "
                  << "file with the same size as the one used for training."
                  << std::endl;
        mPriv->session->Close();
        mPriv->session = NULL;
        return;
      break;
// Releases everything the model owns: the TensorFlow session (if one is
// still open), the alphabet, the optional LM scorer and the private state.
Model::~Model()
{
  if (mPriv->session) {
    mPriv->session->Close();
    // The session came from NewSession(), which transfers ownership to the
    // caller; closing it does not free the object.
    delete mPriv->session;
    mPriv->session = NULL;
  }

  delete mPriv->alphabet;
  delete mPriv->scorer;
  delete mPriv;
}
Model::enableDecoderWithLM(const char* aAlphabetConfigPath, const char* aLMPath,
                           const char* aTriePath, float aLMWeight,
                           float aWordCountWeight, float aValidWordCountWeight)
  mPriv->scorer = new KenLMBeamScorer(aLMPath, aTriePath, aAlphabetConfigPath,
                                      aLMWeight, aWordCountWeight, aValidWordCountWeight);
// Turns an audio buffer into the model's input features by forwarding to
// audioToInputVector() with this model's ncep and ncontext settings.
//
// aBuffer, aBufferSize, aSampleRate are passed through unchanged
// (presumably 16-bit PCM samples, their count and the sampling rate —
// verify against audioToInputVector()).
// aMfcc, aNFrames, aFrameLen are output pointers filled by
// audioToInputVector(); stt() passes NULL for aFrameLen and releases *aMfcc
// with free().
Model::getInputVector(const short* aBuffer, unsigned int aBufferSize,
                           int aSampleRate, float** aMfcc, int* aNFrames,
                           int* aFrameLen)
  return audioToInputVector(aBuffer, aBufferSize, aSampleRate, mPriv->ncep,
                            mPriv->ncontext, aMfcc, aNFrames, aFrameLen);
Model::decode(int aNFrames, float*** aLogits)
  const int batch_size = BATCH_SIZE;
  const int top_paths = 1;
  const int timesteps = aNFrames;
  const size_t num_classes = mPriv->alphabet->GetSize() + 1; // +1 for blank
  // Raw data containers (arrays of floats, ints, etc.).
  int sequence_lengths[batch_size] = {timesteps};
  // Convert data containers to the format accepted by the decoder, simply
  // mapping the memory from the container to an Eigen::ArrayXi,::MatrixXf,
  // using Eigen::Map.
  Eigen::Map<const Eigen::ArrayXi> seq_len(&sequence_lengths[0], batch_size);
  std::vector<Eigen::Map<const Eigen::MatrixXf>> inputs;
  inputs.reserve(timesteps);
  for (int t = 0; t < timesteps; ++t) {
    inputs.emplace_back(&aLogits[t][0][0], batch_size, num_classes);
  // Prepare containers for output and scores.
  // CTCDecoder::Output is std::vector<std::vector<int>>
  std::vector<CTCDecoder::Output> decoder_outputs(top_paths);
  for (CTCDecoder::Output& output : decoder_outputs) {
    output.resize(batch_size);
  float score[batch_size][top_paths] = {{0.0}};
  Eigen::Map<Eigen::MatrixXf> scores(&score[0][0], batch_size, top_paths);
  if (mPriv->scorer == NULL) {
    CTCBeamSearchDecoder<>::DefaultBeamScorer scorer;
    CTCBeamSearchDecoder<> decoder(num_classes,
                                   mPriv->beam_width,
                                   batch_size);
    decoder.Decode(seq_len, inputs, &decoder_outputs, &scores).ok();
    CTCBeamSearchDecoder<KenLMBeamState> decoder(num_classes,
                                                 mPriv->beam_width,
                                                 mPriv->scorer,
    decoder.Decode(seq_len, inputs, &decoder_outputs, &scores).ok();
  // Output is an array of shape (1, n_results, result_length).
  // In this case, n_results is also equal to 1.
  size_t output_length = decoder_outputs[0][0].size() + 1;
  size_t decoded_length = 1; // add 1 for the \0
  for (int i = 0; i < output_length - 1; i++) {
    int64 character = decoder_outputs[0][0][i];
    const std::string& str = mPriv->alphabet->StringFromLabel(character);
    decoded_length += str.size();
  char* output = (char*)malloc(sizeof(char) * decoded_length);
  char* pen = output;
  for (int i = 0; i < output_length - 1; i++) {
    int64 character = decoder_outputs[0][0][i];
    const std::string& str = mPriv->alphabet->StringFromLabel(character);
    strncpy(pen, str.c_str(), str.size());
    pen += str.size();
  *pen = '\0';
  for (int i = 0; i < timesteps; ++i) {
    for (int j = 0; j < batch_size; ++j) {
      free(aLogits[i][j]);
    free(aLogits[i]);
  free(aLogits);
  return output;
Model::infer(float* aMfcc, int aNFrames, int aFrameLen)
  const int batch_size = BATCH_SIZE;
  const int timesteps = aNFrames;
  const size_t num_classes = mPriv->alphabet->GetSize() + 1; // +1 for blank
  const int frameSize = mPriv->ncep + (2 * mPriv->ncep * mPriv->ncontext);
  float*** input_data_mat = (float***)calloc(timesteps, sizeof(float**));
  for (int i = 0; i < timesteps; ++i) {
    input_data_mat[i] = (float**)calloc(batch_size, sizeof(float*));
    for (int j = 0; j < batch_size; ++j) {
      input_data_mat[i][j] = (float*)calloc(num_classes, sizeof(float));
  if (mPriv->run_aot) {
#ifdef DS_NATIVE_MODEL
    Eigen::ThreadPool tp(2);  // Size the thread pool as appropriate.
    Eigen::ThreadPoolDevice device(&tp, tp.NumThreads());
    nativeModel nm(nativeModel::AllocMode::RESULTS_AND_TEMPS_ONLY);
    nm.set_thread_pool(&device);
    for (int ot = 0; ot < timesteps; ot += DS_MODEL_TIMESTEPS) {
      nm.set_arg0_data(&(aMfcc[ot * frameSize]));
      nm.Run();
      // The CTCDecoder works with log-probs.
      for (int t = 0; t < DS_MODEL_TIMESTEPS, (ot + t) < timesteps; ++t) {
        for (int b = 0; b < batch_size; ++b) {
          for (int c = 0; c < num_classes; ++c) {
            input_data_mat[ot + t][b][c] = nm.result0(t, b, c);
    std::cerr << "No support for native model built-in." << std::endl;
    return NULL;
#endif // DS_NATIVE_MODEL
    if (aFrameLen == 0) {
      aFrameLen = frameSize;
    } else if (aFrameLen < frameSize) {
      std::cerr << "mfcc features array is too small (expected " <<
        frameSize << ", got " << aFrameLen << ")\n";
      return NULL;
    Tensor input(DT_FLOAT, TensorShape({1, aNFrames, frameSize}));
    auto input_mapped = input.tensor<float, 3>();
    for (int i = 0, idx = 0; i < aNFrames; i++) {
      for (int j = 0; j < frameSize; j++, idx++) {
        input_mapped(0, i, j) = aMfcc[idx];
      idx += (aFrameLen - frameSize);
    Tensor n_frames(DT_INT32, TensorShape({1}));
    n_frames.scalar<int>()() = aNFrames;
    // The CTC Beam Search decoder takes logits as input, we can feed those from
    // the "logits" node in official models or
    // the "logits_output_node" in old AOT hacking models
    std::vector<Tensor> outputs;
    Status status = mPriv->session->Run(
      {{ "input_node", input }, { "input_lengths", n_frames }},
      {"logits"}, {}, &outputs);
    // If "logits" doesn't exist, this is an older graph. Try to recover.
    if (status.code() == tensorflow::error::NOT_FOUND) {
      status.IgnoreError();
      status = mPriv->session->Run(
        {{ "input_node", input }, { "input_lengths", n_frames }},
        {"logits_output_node"}, {}, &outputs);
    if (!status.ok()) {
      std::cerr << "Error running session: " << status.ToString() << "\n";
      return NULL;
    auto logits_mapped = outputs[0].tensor<float, 3>();
    // The CTCDecoder works with log-probs.
    for (int t = 0; t < timesteps; ++t) {
      for (int b = 0; b < batch_size; ++b) {
        for (int c = 0; c < num_classes; ++c) {
          input_data_mat[t][b][c] = logits_mapped(t, b, c);
  return decode(aNFrames, input_data_mat);
Model::stt(const short* aBuffer, unsigned int aBufferSize, int aSampleRate)
  float* mfcc;
  char* string;
  int n_frames;
  getInputVector(aBuffer, aBufferSize, aSampleRate, &mfcc, &n_frames, NULL);
  string = infer(mfcc, n_frames);
  free(mfcc);
  return string;
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

deepspeech.cc

Latest commit

History

deepspeech.cc

File metadata and controls