Speech Emotion Recognition
Abstract
Speech Emotion Recognition is a Python project that uses AI to recognize emotions from speech. The application features audio processing, model training, and evaluation, demonstrating best practices in NLP and affective computing.
Prerequisites
- Python 3.8 or above
- A code editor or IDE
- Basic understanding of audio processing and ML
- Required libraries:
librosa
librosa
,scikit-learn
scikit-learn
,numpy
numpy
,matplotlib
matplotlib
Before you Start
Install Python and the required libraries:
Install dependencies
pip install librosa scikit-learn numpy matplotlib
Install dependencies
pip install librosa scikit-learn numpy matplotlib
Getting Started
Create a Project
- Create a folder named
speech-emotion-recognition
speech-emotion-recognition
. - Open the folder in your code editor or IDE.
- Create a file named
speech_emotion_recognition.py
speech_emotion_recognition.py
. - Copy the code below into your file.
Write the Code
⚙️ Speech Emotion Recognition
Speech Emotion Recognition
"""
Speech Emotion Recognition
This project detects emotions from speech audio using machine learning. It demonstrates feature extraction (MFCC), model training, prediction, and reporting using scikit-learn. Includes CLI for training and prediction.
"""
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import os
import argparse
import joblib
def extract_features(file_path):
try:
y, sr = librosa.load(file_path, duration=3, offset=0.5)
mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
return mfccs
except Exception as e:
print(f"Error processing {file_path}: {e}")
return np.zeros(40)
def load_data(data_folder):
X, y = [], []
for file in os.listdir(data_folder):
if file.endswith('.wav'):
label = file.split('_')[0] # e.g., happy_01.wav
features = extract_features(os.path.join(data_folder, file))
X.append(features)
y.append(label)
return np.array(X), np.array(y)
def train_model(X, y, model_path=None):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(f"Accuracy: {clf.score(X_test, y_test):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
if model_path:
joblib.dump(clf, model_path)
print(f"Model saved to {model_path}")
return clf
def predict_emotion(model, file_path):
features = extract_features(file_path)
pred = model.predict([features])[0]
print(f"Predicted emotion for {file_path}: {pred}")
return pred
def main():
parser = argparse.ArgumentParser(description="Speech Emotion Recognition")
parser.add_argument('--data', type=str, help='Path to audio data folder')
parser.add_argument('--train', action='store_true', help='Train model')
parser.add_argument('--model', type=str, default='ser_model.pkl', help='Path to save/load model')
parser.add_argument('--predict', type=str, help='Path to audio file for prediction')
args = parser.parse_args()
if args.train and args.data:
X, y = load_data(args.data)
train_model(X, y, args.model)
elif args.predict:
if not os.path.exists(args.model):
print(f"Model file {args.model} not found. Train the model first.")
return
model = joblib.load(args.model)
predict_emotion(model, args.predict)
else:
parser.print_help()
if __name__ == "__main__":
main()
Speech Emotion Recognition
"""
Speech Emotion Recognition
This project detects emotions from speech audio using machine learning. It demonstrates feature extraction (MFCC), model training, prediction, and reporting using scikit-learn. Includes CLI for training and prediction.
"""
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import os
import argparse
import joblib
def extract_features(file_path):
try:
y, sr = librosa.load(file_path, duration=3, offset=0.5)
mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
return mfccs
except Exception as e:
print(f"Error processing {file_path}: {e}")
return np.zeros(40)
def load_data(data_folder):
X, y = [], []
for file in os.listdir(data_folder):
if file.endswith('.wav'):
label = file.split('_')[0] # e.g., happy_01.wav
features = extract_features(os.path.join(data_folder, file))
X.append(features)
y.append(label)
return np.array(X), np.array(y)
def train_model(X, y, model_path=None):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(f"Accuracy: {clf.score(X_test, y_test):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
if model_path:
joblib.dump(clf, model_path)
print(f"Model saved to {model_path}")
return clf
def predict_emotion(model, file_path):
features = extract_features(file_path)
pred = model.predict([features])[0]
print(f"Predicted emotion for {file_path}: {pred}")
return pred
def main():
parser = argparse.ArgumentParser(description="Speech Emotion Recognition")
parser.add_argument('--data', type=str, help='Path to audio data folder')
parser.add_argument('--train', action='store_true', help='Train model')
parser.add_argument('--model', type=str, default='ser_model.pkl', help='Path to save/load model')
parser.add_argument('--predict', type=str, help='Path to audio file for prediction')
args = parser.parse_args()
if args.train and args.data:
X, y = load_data(args.data)
train_model(X, y, args.model)
elif args.predict:
if not os.path.exists(args.model):
print(f"Model file {args.model} not found. Train the model first.")
return
model = joblib.load(args.model)
predict_emotion(model, args.predict)
else:
parser.print_help()
if __name__ == "__main__":
main()
Example Usage
Run emotion recognition
python speech_emotion_recognition.py
Run emotion recognition
python speech_emotion_recognition.py
Explanation
Key Features
- Audio Processing: Extracts features from speech audio.
- Model Training: Trains a model to recognize emotions.
- Evaluation: Assesses model performance.
- Error Handling: Validates inputs and manages exceptions.
Code Breakdown
- Import Libraries and Setup Data
speech_emotion_recognition.py
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
speech_emotion_recognition.py
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
- Audio Processing and Model Training Functions
speech_emotion_recognition.py
def extract_features(audio_path):
y, sr = librosa.load(audio_path)
mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr).T, axis=0)
return mfccs
def train_model(X, y):
model = RandomForestClassifier()
model.fit(X, y)
return model
speech_emotion_recognition.py
def extract_features(audio_path):
y, sr = librosa.load(audio_path)
mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr).T, axis=0)
return mfccs
def train_model(X, y):
model = RandomForestClassifier()
model.fit(X, y)
return model
- Evaluation and Error Handling
speech_emotion_recognition.py
def evaluate_model(model, X_test, y_test):
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
def main():
print("Speech Emotion Recognition")
# X = [...] # Extracted features
# y = [...] # Labels
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# model = train_model(X_train, y_train)
# evaluate_model(model, X_test, y_test)
print("[Demo] Recognition logic here.")
if __name__ == "__main__":
main()
speech_emotion_recognition.py
def evaluate_model(model, X_test, y_test):
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
def main():
print("Speech Emotion Recognition")
# X = [...] # Extracted features
# y = [...] # Labels
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# model = train_model(X_train, y_train)
# evaluate_model(model, X_test, y_test)
print("[Demo] Recognition logic here.")
if __name__ == "__main__":
main()
Features
- Emotion Recognition: Audio processing, model training, and evaluation
- Modular Design: Separate functions for each task
- Error Handling: Manages invalid inputs and exceptions
- Production-Ready: Scalable and maintainable code
Next Steps
Enhance the project by:
- Integrating with real emotion datasets
- Supporting advanced ML models
- Creating a GUI for recognition
- Adding real-time analytics
- Unit testing for reliability
Educational Value
This project teaches:
- Affective Computing: Emotion recognition and ML
- Software Design: Modular, maintainable code
- Error Handling: Writing robust Python code
Real-World Applications
- Voice Assistants
- Customer Support Analytics
- AI Tools
Conclusion
Speech Emotion Recognition demonstrates how to build a scalable and accurate emotion recognition tool using Python. With modular design and extensibility, this project can be adapted for real-world applications in affective computing, analytics, and more. For more advanced projects, visit Python Central Hub.
Was this page helpful?
Let us know how we did