Create a Speech Recognition System in C++ with Free Codes

Intro about Project
Features
Codes for Project

Intro about Project

A Speech Recognition System is a complex application that converts spoken language into text. Implementing such a system in C++ involves capturing audio input, processing the audio data, and using algorithms to recognize the spoken words.

Features

The project provides the following features:

Audio Input
Audio Processing
Speech Recognition Algorithm
User Interface

Codes for Project

Here is the code:

1. Audio Input

Using the PortAudio library to capture audio:

#include <complex>
#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

#include <portaudio.h>

#define SAMPLE_RATE 16000
#define FRAMES_PER_BUFFER 512

static int recordCallback(const void *inputBuffer, void *outputBuffer,
                          unsigned long framesPerBuffer,
                          const PaStreamCallbackTimeInfo *timeInfo,
                          PaStreamCallbackFlags statusFlags,
                          void *userData) {
    std::vector<float> *recordedSamples = (std::vector<float> *)userData;
    const float *input = (const float *)inputBuffer;

    if (input != nullptr) {
        for (unsigned long i = 0; i < framesPerBuffer; ++i) {
            recordedSamples->push_back(input[i]);
        }
    }
    return paContinue;
}

void captureAudio(std::vector<float> &recordedSamples) {
    PaStream *stream;
    PaError err;

    err = Pa_Initialize();
    if (err != paNoError) {
        std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << "\n";
        return;
    }

    err = Pa_OpenDefaultStream(&stream, 1, 0, paFloat32, SAMPLE_RATE,
                               FRAMES_PER_BUFFER, recordCallback, &recordedSamples);
    if (err != paNoError) {
        std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << "\n";
        return;
    }

    err = Pa_StartStream(stream);
    if (err != paNoError) {
        std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << "\n";
        return;
    }

    std::cout << "Recording... Press Enter to stop.\n";
    std::cin.get();

    err = Pa_StopStream(stream);
    if (err != paNoError) {
        std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << "\n";
    }

    err = Pa_CloseStream(stream);
    if (err != paNoError) {
        std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << "\n";
    }

    Pa_Terminate();
}

2. Audio Processing

Using the FFTW library for the Fast Fourier Transform, plus a placeholder for MFCC extraction:

#include <fftw3.h>
#include <vector>
#include <cmath>

// Function to compute FFT
std::vector<std::complex<double>> computeFFT(const std::vector<float> &signal) {
    size_t N = signal.size();
    std::vector<std::complex<double>> fftResult(N);

    fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
    fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
    fftw_plan plan = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);

    for (size_t i = 0; i < N; ++i) {
        in[i][0] = signal[i];
        in[i][1] = 0.0;
    }

    fftw_execute(plan);

    for (size_t i = 0; i < N; ++i) {
        fftResult[i] = std::complex<double>(out[i][0], out[i][1]);
    }

    fftw_destroy_plan(plan);
    fftw_free(in);
    fftw_free(out);

    return fftResult;
}

// Intended to compute MFCCs (Mel-frequency cepstral coefficients) for a signal.
// Currently a stub: the input is ignored and an empty vector is always returned.
std::vector<double> computeMFCC(const std::vector<float> &signal) {
    static_cast<void>(signal);  // not used until the extraction is implemented
    // TODO: Implement MFCC extraction algorithm
    return std::vector<double>{};
}

3. Speech Recognition Algorithm

Using a pre-trained model (e.g., from a library like PocketSphinx):

#include <pocketsphinx.h>

/**
 * Runs PocketSphinx over the recorded samples as a single utterance and
 * prints the recognized hypothesis (or "No speech recognized") to std::cout.
 *
 * The decoder is configured with the acoustic model, language model and
 * dictionary found under "model/en-us/". Samples are converted from float to
 * 16-bit integers by scaling with 32767; values outside [-1, 1] are clamped
 * and NaN becomes 0 so the conversion never overflows int16_t.
 *
 * Failures are reported on std::cerr. The config and decoder are released on
 * every path that created them.
 *
 * @param recordedSamples Audio samples to decode.
 */
void recognizeSpeech(const std::vector<float> &recordedSamples) {
    cmd_ln_t *config = cmd_ln_init(nullptr, ps_args(), TRUE,
                                   "-hmm", "model/en-us/en-us",
                                   "-lm", "model/en-us/en-us.lm.bin",
                                   "-dict", "model/en-us/cmudict-en-us.dict",
                                   nullptr);
    if (config == nullptr) {
        std::cerr << "Failed to create config object, see log for details\n";
        return;
    }

    ps_decoder_t *ps = ps_init(config);
    if (ps == nullptr) {
        std::cerr << "Failed to create recognizer, see log for details\n";
        cmd_ln_free_r(config);  // we still own the config when decoder creation fails
        return;
    }

    // Convert recorded samples to int16 format, clamping so the cast is always in range.
    std::vector<int16_t> samplesInt16;
    samplesInt16.reserve(recordedSamples.size());
    for (float sample : recordedSamples) {
        if (std::isnan(sample)) {
            sample = 0.0f;
        } else if (sample > 1.0f) {
            sample = 1.0f;
        } else if (sample < -1.0f) {
            sample = -1.0f;
        }
        samplesInt16.push_back(static_cast<int16_t>(sample * 32767));
    }

    if (ps_start_utt(ps) < 0) {
        std::cerr << "Failed to start utterance\n";
    } else {
        if (ps_process_raw(ps, samplesInt16.data(), samplesInt16.size(), FALSE, FALSE) < 0) {
            std::cerr << "Failed to process audio data\n";
        }
        if (ps_end_utt(ps) < 0) {
            std::cerr << "Failed to end utterance\n";
        }

        const char *hyp = ps_get_hyp(ps, nullptr);
        if (hyp != nullptr) {
            std::cout << "Recognized: " << hyp << "\n";
        } else {
            std::cout << "No speech recognized\n";
        }
    }

    ps_free(ps);
    cmd_ln_free_r(config);
}

4. Main Function

Combining all parts:

/**
 * Program entry point: records audio, runs the processing stage (FFT and the
 * MFCC placeholder) and then passes the recording to the speech recognizer.
 *
 * @return 0 on completion, 1 if no audio was captured.
 */
int main() {
    std::vector<float> recordedSamples;
    captureAudio(recordedSamples);

    if (recordedSamples.empty()) {
        // Capture failed or was stopped immediately; there is nothing to transform or decode.
        std::cerr << "No audio captured; nothing to process.\n";
        return 1;
    }

    std::cout << "Processing audio...\n";
    auto fftResult = computeFFT(recordedSamples);
    auto mfcc = computeMFCC(recordedSamples);
    // The recognizer consumes the raw samples, so these results are not used further yet.
    static_cast<void>(fftResult);
    static_cast<void>(mfcc);

    std::cout << "Recognizing speech...\n";
    recognizeSpeech(recordedSamples);

    return 0;
}