Intro about Project
A Speech Recognition System is a complex application that converts spoken language into text. Implementing such a system in C++ involves capturing audio input, processing the audio data, and using algorithms to recognize spoken words.
Features
The project provides the following features:
- Audio Input
- Audio Processing
- Speech Recognition Algorithm
- User Interface
Code for the Project
Here is the code:
1. Audio Input
Using the PortAudio library to capture audio:
#include <iostream> #include <portaudio.h> #define SAMPLE_RATE 16000 #define FRAMES_PER_BUFFER 512 static int recordCallback(const void *inputBuffer, void *outputBuffer, unsigned long framesPerBuffer, const PaStreamCallbackTimeInfo *timeInfo, PaStreamCallbackFlags statusFlags, void *userData) { std::vector<float> *recordedSamples = (std::vector<float> *)userData; const float *input = (const float *)inputBuffer; if (input != nullptr) { for (unsigned long i = 0; i < framesPerBuffer; ++i) { recordedSamples->push_back(input[i]); } } return paContinue; } void captureAudio(std::vector<float> &recordedSamples) { PaStream *stream; PaError err; err = Pa_Initialize(); if (err != paNoError) { std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << "\n"; return; } err = Pa_OpenDefaultStream(&stream, 1, 0, paFloat32, SAMPLE_RATE, FRAMES_PER_BUFFER, recordCallback, &recordedSamples); if (err != paNoError) { std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << "\n"; return; } err = Pa_StartStream(stream); if (err != paNoError) { std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << "\n"; return; } std::cout << "Recording... Press Enter to stop.\n"; std::cin.get(); err = Pa_StopStream(stream); if (err != paNoError) { std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << "\n"; } err = Pa_CloseStream(stream); if (err != paNoError) { std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << "\n"; } Pa_Terminate(); }
2. Audio Processing
Using the FFTW library for the Fast Fourier Transform, with a placeholder for MFCC extraction (not yet implemented):
// fftw3.h is kept first, as in the original, so that fftw_complex remains the
// double[2] array type that the [0]/[1] indexing below relies on.
#include <fftw3.h>
#include <climits>
#include <cmath>
#include <complex>
#include <cstddef>
#include <vector>

// Computes the forward complex DFT of a real-valued signal with FFTW.
//
// Each input sample becomes the real part of a complex input value, with the
// imaginary part set to 0. The result has one complex bin per input sample.
//
// Returns an empty vector when the signal is empty, when it has more samples
// than fftw_plan_dft_1d's int size parameter can express, or when FFTW fails
// to allocate its buffers or create a plan.
std::vector<std::complex<double>> computeFFT(const std::vector<float> &signal) {
    const std::size_t N = signal.size();

    // A zero-length transform has no bins; the size must also fit in an int.
    if (N == 0 || N > static_cast<std::size_t>(INT_MAX)) {
        return {};
    }

    fftw_complex *in =
        static_cast<fftw_complex *>(fftw_malloc(sizeof(fftw_complex) * N));
    fftw_complex *out =
        static_cast<fftw_complex *>(fftw_malloc(sizeof(fftw_complex) * N));

    fftw_plan plan = nullptr;
    if (in != nullptr && out != nullptr) {
        plan = fftw_plan_dft_1d(static_cast<int>(N), in, out, FFTW_FORWARD,
                                FFTW_ESTIMATE);
    }

    std::vector<std::complex<double>> fftResult;
    if (plan != nullptr) {
        // Fill the input after planning, as the original did.
        for (std::size_t i = 0; i < N; ++i) {
            in[i][0] = signal[i];
            in[i][1] = 0.0;
        }

        fftw_execute(plan);

        fftResult.reserve(N);
        for (std::size_t i = 0; i < N; ++i) {
            fftResult.emplace_back(out[i][0], out[i][1]);
        }
        fftw_destroy_plan(plan);
    }

    // Release the buffers on both the success and the failure path.
    if (in != nullptr) {
        fftw_free(in);
    }
    if (out != nullptr) {
        fftw_free(out);
    }
    return fftResult;
}

// Placeholder for MFCC extraction: currently ignores the signal and returns
// an empty vector.
std::vector<double> computeMFCC(const std::vector<float> &signal) {
    (void)signal;  // unused until the algorithm is implemented
    std::vector<double> mfcc;
    // TODO: Implement MFCC extraction algorithm
    return mfcc;
}
3. Speech Recognition Algorithm
Using a pre-trained model (e.g., from a library like PocketSphinx):
#include <pocketsphinx.h>
#include <cstdint>
#include <iostream>
#include <vector>

// Converts one float sample to signed 16-bit PCM by scaling with 32767.
// Samples at or beyond +/-1.0 saturate and NaN maps to 0, so the cast below
// never sees a value outside the int16_t range.
static int16_t floatToPcm16(float sample) {
    if (sample >= 1.0f) {
        return 32767;
    }
    if (sample <= -1.0f) {
        return -32767;
    }
    if (sample != sample) {  // NaN is the only value unequal to itself
        return 0;
    }
    return static_cast<int16_t>(sample * 32767.0f);
}

// Decodes the recorded float samples as a single PocketSphinx utterance and
// prints the hypothesis to std::cout, or "No speech recognized" when the
// decoder returns none.
//
// The acoustic model, language model and dictionary are loaded from the
// relative "model/en-us/" paths. Failures are reported on std::cerr, and the
// decoder and configuration are released on every path.
void recognizeSpeech(const std::vector<float> &recordedSamples) {
    cmd_ln_t *config = cmd_ln_init(
        nullptr, ps_args(), TRUE,
        "-hmm", "model/en-us/en-us",
        "-lm", "model/en-us/en-us.lm.bin",
        "-dict", "model/en-us/cmudict-en-us.dict",
        nullptr);
    if (config == nullptr) {
        std::cerr << "Failed to create config object, see log for details\n";
        return;
    }

    ps_decoder_t *ps = ps_init(config);
    if (ps == nullptr) {
        std::cerr << "Failed to create recognizer, see log for details\n";
        cmd_ln_free_r(config);  // we still own our reference to the config
        return;
    }

    // Convert recorded samples to int16 format
    std::vector<int16_t> samplesInt16;
    samplesInt16.reserve(recordedSamples.size());
    for (const float sample : recordedSamples) {
        samplesInt16.push_back(floatToPcm16(sample));
    }

    if (ps_start_utt(ps) < 0) {
        std::cerr << "Failed to start utterance\n";
    } else {
        if (ps_process_raw(ps, samplesInt16.data(), samplesInt16.size(),
                           FALSE, FALSE) < 0) {
            std::cerr << "Failed to process audio data\n";
        }
        ps_end_utt(ps);

        const char *hyp = ps_get_hyp(ps, nullptr);
        if (hyp != nullptr) {
            std::cout << "Recognized: " << hyp << "\n";
        } else {
            std::cout << "No speech recognized\n";
        }
    }

    ps_free(ps);
    cmd_ln_free_r(config);
}
4. Main Function
Combining all parts:
// Program entry point: records audio, runs the FFT and MFCC steps, then
// performs speech recognition on the recording.
//
// Returns 0 after a complete run, or 1 when no audio was captured.
int main() {
    std::vector<float> recordedSamples;
    captureAudio(recordedSamples);

    // Capture can fail or be stopped immediately; the later stages have
    // nothing to work on without samples.
    if (recordedSamples.empty()) {
        std::cerr << "No audio captured; nothing to process.\n";
        return 1;
    }

    std::cout << "Processing audio...\n";
    const auto fftResult = computeFFT(recordedSamples);
    const auto mfcc = computeMFCC(recordedSamples);
    // The spectral features are computed but not yet used by recognition.
    (void)fftResult;
    (void)mfcc;

    std::cout << "Recognizing speech...\n";
    recognizeSpeech(recordedSamples);
    return 0;
}