Speech Emotion Recognition

Abstract

Speech Emotion Recognition is a Python project that uses machine learning to recognize emotions from speech. The application features audio processing, model training, and evaluation, demonstrating best practices in speech processing and affective computing.

Prerequisites

Python 3.8 or above
A code editor or IDE
Basic understanding of audio processing and ML
Required libraries: librosa, scikit-learn, numpy, matplotlib (joblib, used to save the model, is installed automatically with scikit-learn)

Before you Start

Install Python and the required libraries:

Install dependencies

pip install librosa scikit-learn numpy matplotlib

Install dependencies

pip install librosa scikit-learn numpy matplotlib

Getting Started

Create a Project

Create a folder named speech-emotion-recognition.
Open the folder in your code editor or IDE.
Create a file named speech_emotion_recognition.py.
Copy the code below into your file.

Write the Code

⚙️ Speech Emotion Recognition

Speech Emotion Recognition

"""
Speech Emotion Recognition
 
This project detects emotions from speech audio using machine learning. It demonstrates feature extraction (MFCC), model training, prediction, and reporting using scikit-learn. Includes CLI for training and prediction.
"""
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import os
import argparse
import joblib
 
def extract_features(file_path):
    """Return a 40-value MFCC summary vector for one audio file.

    Loads at most 3 seconds of audio, starting 0.5 seconds into the file,
    computes 40 MFCCs per frame and averages every coefficient over the frames.

    Args:
        file_path: Path of the audio file to analyse.

    Returns:
        The per-coefficient mean of the MFCCs. If loading or feature
        extraction raises, the error is printed and ``np.zeros(40)`` is
        returned instead of propagating the exception.
    """
    n_coefficients = 40
    try:
        signal, sample_rate = librosa.load(file_path, duration=3, offset=0.5)
        per_frame = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_coefficients)
        # After the transpose the frames lie on axis 0, so this averages over time.
        return per_frame.T.mean(axis=0)
    except Exception as error:
        print(f"Error processing {file_path}: {error}")
        return np.zeros(n_coefficients)
 
def load_data(data_folder):
    """Build the feature matrix and label vector from a folder of WAV files.

    The label of a file is the part of its name (without extension) before
    the first underscore, e.g. ``happy_01.wav`` -> ``happy``.

    Args:
        data_folder: Directory containing the ``.wav`` files.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays: one feature vector per usable
        file and the matching label strings. Both are empty when the folder
        contains no usable ``.wav`` file.
    """
    X, y = [], []
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order reproducible from run to run and machine to machine.
    for file in sorted(os.listdir(data_folder)):
        stem, extension = os.path.splitext(file)
        # Compare case-insensitively so files named *.WAV are not ignored.
        if extension.lower() != '.wav':
            continue
        features = extract_features(os.path.join(data_folder, file))
        # extract_features reports a failure as an all-zero vector; keeping it
        # would add a meaningless sample under a real emotion label.
        if not np.any(features):
            print(f"Skipping {file}: no features could be extracted")
            continue
        X.append(features)
        y.append(stem.split('_')[0])
    return np.array(X), np.array(y)
 
def train_model(X, y, model_path=None):
    """Train a random-forest emotion classifier and print hold-out metrics.

    The data is split 80/20 with a fixed seed; accuracy, a classification
    report and a confusion matrix are printed for the 20% test portion.

    Args:
        X: Feature matrix, one row per audio sample.
        y: Labels aligned with the rows of ``X``.
        model_path: Optional file path; when truthy, the fitted classifier is
            saved there with joblib.

    Returns:
        The fitted ``RandomForestClassifier``.

    Raises:
        ValueError: If ``X`` is empty or ``X`` and ``y`` differ in length.
    """
    # Fail early with an actionable message instead of an opaque error raised
    # inside scikit-learn (e.g. when the data folder held no .wav files).
    if len(X) == 0:
        raise ValueError("No training samples found; check the data folder.")
    if len(X) != len(y):
        raise ValueError(
            f"Feature/label count mismatch: {len(X)} samples vs {len(y)} labels."
        )

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Accuracy is the share of matching predictions, so reuse y_pred instead
    # of letting clf.score() predict over X_test a second time.
    accuracy = np.mean(y_pred == np.asarray(y_test))
    print(f"Accuracy: {accuracy:.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    if model_path:
        joblib.dump(clf, model_path)
        print(f"Model saved to {model_path}")
    return clf
 
def predict_emotion(model, file_path):
    """Classify the emotion in a single audio file and print the result.

    Args:
        model: Fitted classifier exposing ``predict``.
        file_path: Path of the audio file to classify.

    Returns:
        The label predicted for the file.
    """
    # predict() takes a batch, so wrap the single feature vector in a list
    # and take the only prediction back out.
    batch = [extract_features(file_path)]
    predictions = model.predict(batch)
    emotion = predictions[0]
    print(f"Predicted emotion for {file_path}: {emotion}")
    return emotion
 
def main(argv=None):
    """Command-line entry point: train a model or predict one file's emotion.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read the process command line, as before.

    Raises:
        SystemExit: Raised by argparse (status 2) when ``--train`` is given
            without ``--data`` or the data folder does not exist.
    """
    parser = argparse.ArgumentParser(description="Speech Emotion Recognition")
    parser.add_argument('--data', type=str, help='Path to audio data folder')
    parser.add_argument('--train', action='store_true', help='Train model')
    parser.add_argument('--model', type=str, default='ser_model.pkl', help='Path to save/load model')
    parser.add_argument('--predict', type=str, help='Path to audio file for prediction')
    args = parser.parse_args(argv)

    # Report an incomplete training request explicitly instead of silently
    # falling through to another branch.
    if args.train and not args.data:
        parser.error("--train requires --data <folder>")

    if args.train:
        if not os.path.isdir(args.data):
            parser.error(f"Data folder {args.data} not found.")
        X, y = load_data(args.data)
        train_model(X, y, args.model)
    elif args.predict:
        if not os.path.exists(args.model):
            print(f"Model file {args.model} not found. Train the model first.")
            return
        # NOTE: joblib.load unpickles the file and can execute arbitrary code;
        # only load model files from a trusted source.
        model = joblib.load(args.model)
        predict_emotion(model, args.predict)
    else:
        parser.print_help()


if __name__ == "__main__":
    main()

Speech Emotion Recognition

"""
Speech Emotion Recognition
 
This project detects emotions from speech audio using machine learning. It demonstrates feature extraction (MFCC), model training, prediction, and reporting using scikit-learn. Includes CLI for training and prediction.
"""
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import os
import argparse
import joblib
 
def extract_features(file_path):
    """Return a 40-value MFCC summary vector for one audio file.

    Loads at most 3 seconds of audio, starting 0.5 seconds into the file,
    computes 40 MFCCs per frame and averages every coefficient over the frames.

    Args:
        file_path: Path of the audio file to analyse.

    Returns:
        The per-coefficient mean of the MFCCs. If loading or feature
        extraction raises, the error is printed and ``np.zeros(40)`` is
        returned instead of propagating the exception.
    """
    n_coefficients = 40
    try:
        signal, sample_rate = librosa.load(file_path, duration=3, offset=0.5)
        per_frame = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_coefficients)
        # After the transpose the frames lie on axis 0, so this averages over time.
        return per_frame.T.mean(axis=0)
    except Exception as error:
        print(f"Error processing {file_path}: {error}")
        return np.zeros(n_coefficients)
 
def load_data(data_folder):
    """Build the feature matrix and label vector from a folder of WAV files.

    The label of a file is the part of its name (without extension) before
    the first underscore, e.g. ``happy_01.wav`` -> ``happy``.

    Args:
        data_folder: Directory containing the ``.wav`` files.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays: one feature vector per usable
        file and the matching label strings. Both are empty when the folder
        contains no usable ``.wav`` file.
    """
    X, y = [], []
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order reproducible from run to run and machine to machine.
    for file in sorted(os.listdir(data_folder)):
        stem, extension = os.path.splitext(file)
        # Compare case-insensitively so files named *.WAV are not ignored.
        if extension.lower() != '.wav':
            continue
        features = extract_features(os.path.join(data_folder, file))
        # extract_features reports a failure as an all-zero vector; keeping it
        # would add a meaningless sample under a real emotion label.
        if not np.any(features):
            print(f"Skipping {file}: no features could be extracted")
            continue
        X.append(features)
        y.append(stem.split('_')[0])
    return np.array(X), np.array(y)
 
def train_model(X, y, model_path=None):
    """Train a random-forest emotion classifier and print hold-out metrics.

    The data is split 80/20 with a fixed seed; accuracy, a classification
    report and a confusion matrix are printed for the 20% test portion.

    Args:
        X: Feature matrix, one row per audio sample.
        y: Labels aligned with the rows of ``X``.
        model_path: Optional file path; when truthy, the fitted classifier is
            saved there with joblib.

    Returns:
        The fitted ``RandomForestClassifier``.

    Raises:
        ValueError: If ``X`` is empty or ``X`` and ``y`` differ in length.
    """
    # Fail early with an actionable message instead of an opaque error raised
    # inside scikit-learn (e.g. when the data folder held no .wav files).
    if len(X) == 0:
        raise ValueError("No training samples found; check the data folder.")
    if len(X) != len(y):
        raise ValueError(
            f"Feature/label count mismatch: {len(X)} samples vs {len(y)} labels."
        )

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Accuracy is the share of matching predictions, so reuse y_pred instead
    # of letting clf.score() predict over X_test a second time.
    accuracy = np.mean(y_pred == np.asarray(y_test))
    print(f"Accuracy: {accuracy:.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    if model_path:
        joblib.dump(clf, model_path)
        print(f"Model saved to {model_path}")
    return clf
 
def predict_emotion(model, file_path):
    """Classify the emotion in a single audio file and print the result.

    Args:
        model: Fitted classifier exposing ``predict``.
        file_path: Path of the audio file to classify.

    Returns:
        The label predicted for the file.
    """
    # predict() takes a batch, so wrap the single feature vector in a list
    # and take the only prediction back out.
    batch = [extract_features(file_path)]
    predictions = model.predict(batch)
    emotion = predictions[0]
    print(f"Predicted emotion for {file_path}: {emotion}")
    return emotion
 
def main(argv=None):
    """Command-line entry point: train a model or predict one file's emotion.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read the process command line, as before.

    Raises:
        SystemExit: Raised by argparse (status 2) when ``--train`` is given
            without ``--data`` or the data folder does not exist.
    """
    parser = argparse.ArgumentParser(description="Speech Emotion Recognition")
    parser.add_argument('--data', type=str, help='Path to audio data folder')
    parser.add_argument('--train', action='store_true', help='Train model')
    parser.add_argument('--model', type=str, default='ser_model.pkl', help='Path to save/load model')
    parser.add_argument('--predict', type=str, help='Path to audio file for prediction')
    args = parser.parse_args(argv)

    # Report an incomplete training request explicitly instead of silently
    # falling through to another branch.
    if args.train and not args.data:
        parser.error("--train requires --data <folder>")

    if args.train:
        if not os.path.isdir(args.data):
            parser.error(f"Data folder {args.data} not found.")
        X, y = load_data(args.data)
        train_model(X, y, args.model)
    elif args.predict:
        if not os.path.exists(args.model):
            print(f"Model file {args.model} not found. Train the model first.")
            return
        # NOTE: joblib.load unpickles the file and can execute arbitrary code;
        # only load model files from a trusted source.
        model = joblib.load(args.model)
        predict_emotion(model, args.predict)
    else:
        parser.print_help()


if __name__ == "__main__":
    main()

Example Usage

Run emotion recognition

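python speech_emotion_recognition.py --train --data path/to/wav_folder --model ser_model.pkl
python speech_emotion_recognition.py --predict path/to/sample.wav --model ser_model.pkl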

Run emotion recognition

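python speech_emotion_recognition.py --train --data path/to/wav_folder --model ser_model.pkl
python speech_emotion_recognition.py --predict path/to/sample.wav --model ser_model.pkl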

Explanation

Key Features

Audio Processing: Extracts features from speech audio.
Model Training: Trains a model to recognize emotions.
Evaluation: Assesses model performance.
Error Handling: Validates inputs and manages exceptions.

Code Breakdown

Import Libraries and Setup Data

speech_emotion_recognition.py

import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

speech_emotion_recognition.py

import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

Audio Processing and Model Training Functions

speech_emotion_recognition.py

def extract_features(audio_path):
    """Return the time-averaged MFCC vector for the audio file at *audio_path*.

    Loading and MFCC computation both use librosa's default settings.
    """
    signal, sample_rate = librosa.load(audio_path)
    per_frame = librosa.feature.mfcc(y=signal, sr=sample_rate)
    # After the transpose the frames lie on axis 0, so this averages over time.
    return per_frame.T.mean(axis=0)
 
def train_model(X, y):
    """Fit a default-configured random forest on *X* and *y* and return it."""
    # fit() returns the estimator itself, so the fitted model can be
    # returned straight from the call.
    return RandomForestClassifier().fit(X, y)

speech_emotion_recognition.py

def extract_features(audio_path):
    """Return the time-averaged MFCC vector for the audio file at *audio_path*.

    Loading and MFCC computation both use librosa's default settings.
    """
    signal, sample_rate = librosa.load(audio_path)
    per_frame = librosa.feature.mfcc(y=signal, sr=sample_rate)
    # After the transpose the frames lie on axis 0, so this averages over time.
    return per_frame.T.mean(axis=0)
 
def train_model(X, y):
    """Fit a default-configured random forest on *X* and *y* and return it."""
    # fit() returns the estimator itself, so the fitted model can be
    # returned straight from the call.
    return RandomForestClassifier().fit(X, y)

Evaluation and Error Handling

speech_emotion_recognition.py

def evaluate_model(model, X_test, y_test):
    """Print a classification report for *model* on the given test set.

    Args:
        model: Fitted classifier exposing ``predict``.
        X_test: Feature rows to predict on.
        y_test: True labels for ``X_test``.
    """
    predictions = model.predict(X_test)
    report = classification_report(y_test, predictions)
    print(report)
 
def main():
    """Print the demo banner and a placeholder message.

    The training/evaluation pipeline is only sketched in the comment below;
    nothing is loaded or trained here.
    """
    # Intended pipeline once real data is available:
    #   X = [...]  # Extracted features
    #   y = [...]  # Labels
    #   X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    #   model = train_model(X_train, y_train)
    #   evaluate_model(model, X_test, y_test)
    for line in ("Speech Emotion Recognition", "[Demo] Recognition logic here."):
        print(line)


if __name__ == "__main__":
    main()

speech_emotion_recognition.py

def evaluate_model(model, X_test, y_test):
    """Print a classification report for *model* on the given test set.

    Args:
        model: Fitted classifier exposing ``predict``.
        X_test: Feature rows to predict on.
        y_test: True labels for ``X_test``.
    """
    predictions = model.predict(X_test)
    report = classification_report(y_test, predictions)
    print(report)
 
def main():
    """Print the demo banner and a placeholder message.

    The training/evaluation pipeline is only sketched in the comment below;
    nothing is loaded or trained here.
    """
    # Intended pipeline once real data is available:
    #   X = [...]  # Extracted features
    #   y = [...]  # Labels
    #   X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    #   model = train_model(X_train, y_train)
    #   evaluate_model(model, X_test, y_test)
    for line in ("Speech Emotion Recognition", "[Demo] Recognition logic here."):
        print(line)


if __name__ == "__main__":
    main()

Features

Emotion Recognition: Audio processing, model training, and evaluation
Modular Design: Separate functions for each task
Error Handling: Manages invalid inputs and exceptions
Production-Ready: Scalable and maintainable code

Next Steps

Enhance the project by:

Integrating with real emotion datasets
Supporting advanced ML models
Creating a GUI for recognition
Adding real-time analytics
Unit testing for reliability

Educational Value

This project teaches:

Affective Computing: Emotion recognition and ML
Software Design: Modular, maintainable code
Error Handling: Writing robust Python code

Real-World Applications

Voice Assistants
Customer Support Analytics
AI Tools

Conclusion

Speech Emotion Recognition demonstrates how to build a scalable and accurate emotion recognition tool using Python. With modular design and extensibility, this project can be adapted for real-world applications in affective computing, analytics, and more. For more advanced projects, visit Python Central Hub.

Speech Emotion Recognition

Abstract

Prerequisites

Before you Start

Getting Started

Create a Project

Write the Code

Example Usage

Explanation

Key Features

Code Breakdown

Features

Next Steps

Educational Value

Real-World Applications

Conclusion

Was this page helpful?