import glob
import os
import re

import kagglehub
import librosa
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset, Subset
from tqdm import tqdm
# Fetch the GTZAN genre dataset through kagglehub and report where it landed.
dataset_handle = "andradaolteanu/gtzan-dataset-music-genre-classification"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)
Path to dataset files: /Users/Andrej/.cache/kagglehub/datasets/andradaolteanu/gtzan-dataset-music-genre-classification/versions/1
# List the genre folders once and report both the names and how many there are.
genre_names = os.listdir(f'{path}/Data/genres_original/')
print("music genres in the dataset:", genre_names)
print("number of music genres:", len(genre_names))
music genres in the dataset: ['pop', 'metal', 'disco', 'blues', 'reggae', 'classical', 'rock', 'hiphop', 'country', 'jazz'] number of music genres: 10
directory = f'{path}/Data/genres_original/'

# Shortest decoded clip (in samples) seen so far, and the number of .wav files visited.
min_length = float('inf')
number_files = 0

# Walk every genre folder and decode each .wav file to measure its length.
for genre in os.listdir(directory):
    genre_path = os.path.join(directory, genre)
    # Skip anything that is not a genre folder.
    if not os.path.isdir(genre_path):
        continue
    for filename in os.listdir(genre_path):
        if not filename.endswith('.wav'):
            continue
        file_path = os.path.join(genre_path, filename)
        number_files += 1
        # librosa returns the decoded samples and the sampling rate.
        y, sr = librosa.load(file_path)
        if len(y) < min_length:
            min_length = len(y)

print(f"The minimum length of the music time series across all files is: {min_length}, it's corresponds to a minimum of {min_length/22050} seconds")
print("Number of files:", number_files)
The minimum length of the music time series across all files is: 660000, it's corresponds to a minimum of 29.931972789115648 seconds Number of files: 999
# Root folder holding one sub-folder per genre.
directory = f'{path}/Data/genres_original/'
len_dataset = 0
dataset_filenames = []

# For every .wav file, register one [file_path, chunk_index] pair per
# 1024-sample chunk that fits into the shortest clip.
for genre in os.listdir(directory):
    genre_path = os.path.join(directory, genre)
    if not os.path.isdir(genre_path):
        continue
    for filename in os.listdir(genre_path):
        if not filename.endswith('.wav'):
            continue
        file_path = os.path.join(genre_path, filename)
        dataset_filenames.extend([file_path, i] for i in range(min_length//1024))
def extract_string(input_str):
    """Return the text between the first pair of slashes in `input_str`.

    Args:
        input_str (str): Text to search, e.g. a path such as "data/jazz/a.wav".

    Returns:
        str | None: The first non-empty, slash-free segment that is enclosed
        by two slashes, or None when no such segment exists.
    """
    # Regular expression to capture the string between the first two slashes
    match = re.search(r'/([^/]+)/', input_str)
    if match:
        return match.group(1)
    return None
# Parameters and Configuration
AUDIO_DIR = f'{path}/Data/genres_original/'  # Directory containing audio files
N_FFT = 1024
N_MELS = 64  # Frequency bins for the Mel-spectrogram
DURATION = 10.0  # Duration of each audio clip in seconds (30s is the entire clip)
# Mini-batch size used as the default by get_dataloaders() and ConvRBM.
# NOTE(review): the definition was missing from this file; 64 matches the
# recorded batch shape ([64, 1, 96, 431]) and batch counts (13/2/2) - confirm.
BATCH_SIZE = 64
GENRES = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']  # Example classes

# Pick the best available accelerator, falling back to the CPU so the script
# also runs on machines that have neither CUDA nor Apple MPS.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
else:
    DEVICE = torch.device("cpu")

# ConvRBM Hyperparameters
VISIBLE_CHANNELS = 1  # Spectrogram is treated as a single "image" channel
HIDDEN_CHANNELS = 16  # Number of convolutional filters
KERNEL_SIZE = (8, 8)
CD_K = 1  # Contrastive divergence steps
EPOCHS = 1000
class AudioDataset(Dataset):
    """In-memory dataset of normalised log-Mel spectrograms with genre labels.

    Every ``*.wav`` file under ``audio_dir/<genre>/`` is decoded once at
    construction time; the label of a clip is the index of its genre in
    ``genres``.

    NOTE(review): several statements of this class were truncated in the
    source (the storage attribute name, the arguments of ``librosa.load`` and
    ``melspectrogram``, the append calls). They are reconstructed here from
    the surrounding code - confirm against the original notebook.
    """

    def __init__(self, audio_dir, genres, duration=10.0, n_mels=64):
        """Load all clips eagerly.

        Args:
            audio_dir (str): Folder containing one sub-folder per genre.
            genres (list[str]): Genre folder names; list position is the label.
            duration (float): Seconds of audio to read from each file.
            n_mels (int): Number of Mel bands in each spectrogram.
        """
        self.data = []
        self.labels = []
        self._load_audio_files(audio_dir, genres, duration, n_mels)

    @staticmethod
    def _normalize(log_mel):
        """Shift `log_mel` so its minimum is 0 and scale its maximum to 1.

        A constant input (e.g. digital silence) has zero range; it is returned
        as all zeros instead of being divided by zero.
        """
        shifted = log_mel - log_mel.min()
        peak = shifted.max()
        if peak > 0:
            shifted = shifted / peak
        return shifted

    def _load_audio_files(self, audio_dir, genres, duration, n_mels):
        """Decode every clip and store its spectrogram and label."""
        for i, genre in enumerate(genres):
            genre_path = os.path.join(audio_dir, genre)
            # Sorted so that dataset indices (and therefore seeded splits) do
            # not depend on the file-system listing order.
            for fname in sorted(glob.glob(os.path.join(genre_path, "*.wav"))):
                audio, sr = librosa.load(fname, duration=duration)
                # Compute Mel-Spectrogram
                # NOTE(review): n_fft=N_FFT is assumed from the module-level
                # constant; the original arguments were lost.
                mel_spec = librosa.feature.melspectrogram(
                    y=audio, sr=sr, n_fft=N_FFT, n_mels=n_mels)
                # Convert to log scale for better dynamic range
                log_mel = librosa.power_to_db(mel_spec, ref=np.max)
                # Normalize
                self.data.append(self._normalize(log_mel))
                self.labels.append(i)

    def __len__(self):
        """Return the number of loaded clips."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(spectrogram, label)`` for clip `idx`.

        The spectrogram gains a leading channel axis of size 1 and is float32;
        the label is a scalar int64 tensor.
        """
        mel_tensor = torch.tensor(self.data[idx], dtype=torch.float32).unsqueeze(0)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)
        return mel_tensor, label_tensor
def split_dataset(audio_dir=AUDIO_DIR, genres=GENRES, n_mels=N_MELS, test_size=0.1, val_size=0.1, random_seed=14):
    """Build the audio dataset and cut it into train/validation/test subsets.

    Args:
        audio_dir: Folder with one sub-folder per genre.
        genres: Genre names handed to AudioDataset.
        n_mels: Number of Mel bands handed to AudioDataset.
        test_size: Fraction of all samples reserved for the test subset.
        val_size: Fraction of all samples reserved for the validation subset.
        random_seed: Seed used for both random splits.

    Returns:
        tuple: (train_subset, val_subset, test_subset), all views on the same
        underlying dataset.
    """
    full_dataset = AudioDataset(audio_dir, genres, n_mels=n_mels)
    all_indices = list(range(len(full_dataset)))

    # First cut: hold out the test indices.
    remaining_indices, test_indices = train_test_split(
        all_indices, test_size=test_size, random_state=random_seed)

    # Second cut: val_size is expressed relative to the whole dataset, so it
    # is rescaled to the share of what is left after removing the test part.
    val_fraction = val_size / (1 - test_size)
    train_indices, val_indices = train_test_split(
        remaining_indices, test_size=val_fraction, random_state=random_seed)

    return tuple(
        Subset(full_dataset, chosen)
        for chosen in (train_indices, val_indices, test_indices)
    )
def get_dataloaders(audio_dir, genres=GENRES, n_mels=64, batch_size=BATCH_SIZE, test_size=0.1, val_size=0.1, shuffle=True, random_seed=13):
    """Create train/validation/test DataLoaders over the split audio dataset.

    Args:
        audio_dir: Folder with one sub-folder per genre.
        genres: Genre names forwarded to split_dataset.
        n_mels: Number of Mel bands forwarded to split_dataset.
        batch_size: Batch size shared by all three loaders.
        test_size: Fraction of samples used for testing.
        val_size: Fraction of samples used for validation.
        shuffle: Whether the training loader shuffles; the validation and test
            loaders never shuffle.
        random_seed: Seed forwarded to split_dataset.

    Returns:
        tuple: (train_loader, val_loader, test_loader).
    """
    subsets = split_dataset(
        audio_dir, genres, test_size=test_size, val_size=val_size, n_mels=n_mels, random_seed=random_seed)
    # Only the training loader honours `shuffle`.
    shuffle_flags = (shuffle, False, False)
    return tuple(
        DataLoader(subset, batch_size=batch_size, shuffle=flag)
        for subset, flag in zip(subsets, shuffle_flags)
    )
# Build the three loaders used by the rest of the script.
# NOTE(review): the call arguments were truncated in the source; these are
# reconstructed from the module-level configuration - confirm. The training
# loader shuffles by default, so its batch order does not follow dataset order.
train_loader_64, val_loader_64, test_loader_64 = get_dataloaders(
    AUDIO_DIR, genres=GENRES, n_mels=N_MELS)
print("Number of training batches:", len(train_loader_64))
print("Number of validation batches:", len(val_loader_64))
print("Number of test batches:", len(test_loader_64))
Number of training batches: 13 Number of validation batches: 2 Number of test batches: 2
# Visualizing audio recordings
def visualize_spectrogram(loader, sample_index=0):
    """
    Visualize a spectrogram from the given DataLoader.

    Args:
        loader (DataLoader): DataLoader containing the spectrograms.
        sample_index (int): Index of the sample to visualize within the first batch.

    Raises:
        IndexError: If `sample_index` is not smaller than the first batch's size.
    """
    for batch_X, batch_y in loader:
        print("batch_X shape:", batch_X.shape)
        # Drop the channel axis so imshow receives a 2-D array.
        spectrogram = batch_X[sample_index].squeeze(0).cpu().numpy()
        label = GENRES[batch_y[sample_index].item()]
        plt.figure(figsize=(10, 6))
        plt.imshow(spectrogram, aspect='auto', origin='lower', cmap='viridis')
        plt.title(f"Spectrogram for Sample {sample_index} (Label: {label})")
        plt.show()
        # Only the first batch is inspected.
        break
# Show a handful of samples taken from the first training batch.
sample_positions = (0, 3, 5, 9)
for i in sample_positions:
    visualize_spectrogram(train_loader_64, sample_index=i)
batch_X shape: torch.Size([64, 1, 96, 431])
batch_X shape: torch.Size([64, 1, 96, 431])
batch_X shape: torch.Size([64, 1, 96, 431])
batch_X shape: torch.Size([64, 1, 96, 431])
# ConvRBM model
class ConvRBM(nn.Module):
    """Convolutional restricted Boltzmann machine with Bernoulli units.

    The visible layer is a batch of images shaped
    [batch, visible_channels, height, width]; the hidden layer is the result
    of an unpadded convolution with `hidden_channels` filters. The module owns
    its Adam optimizer and is trained with k-step contrastive divergence.

    NOTE(review): several statements of this class were truncated in the
    source (device transfer, checkpoint saving, loss bookkeeping, and the
    parameter update itself). They are reconstructed here - confirm against
    the original notebook.
    """

    def __init__(self, visible_channels, hidden_channels, kernel_size=KERNEL_SIZE, learning_rate=1e-3, cd_k=1, batch_size=BATCH_SIZE):
        """Create the filters, biases and optimizer.

        Args:
            visible_channels (int): Channels of the input images.
            hidden_channels (int): Number of convolutional filters.
            kernel_size (tuple[int, int]): Filter height and width.
            learning_rate (float): Learning rate of the Adam optimizer.
            cd_k (int): Number of Gibbs steps per contrastive-divergence update.
            batch_size (int): Stored on the instance; not used by the methods below.

        Raises:
            ValueError: If `cd_k` is smaller than 1.
        """
        super().__init__()
        if cd_k < 1:
            raise ValueError("cd_k must be at least 1")
        self.visible_channels = visible_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.cd_k = cd_k
        self.batch_size = batch_size
        # Weights & biases
        self.W = nn.Parameter(torch.randn(hidden_channels, visible_channels, kernel_size[0], kernel_size[1]) * 0.01)
        self.v_bias = nn.Parameter(torch.zeros(1))  # single bias shared by all visible units
        self.h_bias = nn.Parameter(torch.zeros(hidden_channels, 1, 1))  # per hidden channel bias
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def sample_h(self, v):
        """Return hidden activation probabilities and a Bernoulli sample of them.

        Args:
            v: Visible batch, [batch, visible_channels, height, width].

        Returns:
            tuple: (h_prob, h_sample), both [batch, hidden_channels, h_out, w_out].
        """
        # Convolution to compute hidden probabilities
        conv_v = nn.functional.conv2d(v, self.W, bias=None)
        h_prob = torch.sigmoid(conv_v + self.h_bias)
        h_sample = torch.bernoulli(h_prob)
        return h_prob, h_sample

    def sample_v(self, h):
        """Return visible activation probabilities and a Bernoulli sample of them.

        Args:
            h: Hidden batch, [batch, hidden_channels, h_out, w_out].

        Returns:
            tuple: (v_prob, v_sample) with the shape of the visible layer.
        """
        # Transposing convolution to reconstruct visible
        deconv_h = nn.functional.conv_transpose2d(h, self.W, bias=None)
        v_prob = torch.sigmoid(deconv_h + self.v_bias)
        v_sample = torch.bernoulli(v_prob)
        return v_prob, v_sample

    def forward(self, v):
        """Return the hidden activation probabilities for `v`."""
        h_prob, _ = self.sample_h(v)
        return h_prob

    def _free_energy(self, v):
        """Return the free energy of each visible configuration in `v`, shape [batch]."""
        visible_term = self.v_bias * v.flatten(start_dim=1).sum(dim=1)
        h_lin = nn.functional.conv2d(v, self.W, bias=None) + self.h_bias
        hidden_term = nn.functional.softplus(h_lin).flatten(start_dim=1).sum(dim=1)
        return -(visible_term + hidden_term)

    # Contrastive Divergence - simplified version
    def contrastive_divergence(self, v):
        """Run CD-k on one batch and return the reconstruction error.

        When gradients are enabled the parameters are updated by one optimizer
        step on the free-energy difference between the data and the end of the
        Gibbs chain. Inside ``torch.no_grad()`` (as used for validation) the
        method only evaluates and leaves the parameters untouched.

        Args:
            v: Visible batch, [batch, visible_channels, height, width].

        Returns:
            float: Mean squared error between `v` and the reconstructed
            visible probabilities of the last Gibbs step.
        """
        # Must be read before entering the no_grad block below.
        update_parameters = torch.is_grad_enabled()

        # The Gibbs chain is pure sampling; no graph is needed for it.
        with torch.no_grad():
            # Positive phase
            _, h_sample = self.sample_h(v)
            # Negative phase
            v_neg = v
            v_prob_neg = v
            for _ in range(self.cd_k):
                v_prob_neg, v_neg = self.sample_v(h_sample)
                _, h_sample = self.sample_h(v_neg)
            # Reconstruction loss
            recon_error = torch.mean((v - v_prob_neg) ** 2)

        if update_parameters:
            # The gradient of F(data) - F(model sample) is the CD-k update.
            cd_loss = (self._free_energy(v) - self._free_energy(v_neg)).mean()
            self.optimizer.zero_grad()
            cd_loss.backward()
            self.optimizer.step()

        return recon_error.item()

    def transform(self, v, approach="flatten"):
        """Turn a visible batch into one flat feature vector per sample.

        The hidden probabilities are computed and then reduced according to
        `approach`:

        * "global_average_pooling": mean over both spatial axes.
        * "global_max_pooling": maximum over both spatial axes.
        * "max_pooling": 2x2 max pooling with stride 2.
        * "avg_pool2d": 2x2 average pooling with stride 2.
        * anything else (including the default "flatten"): no pooling.

        Args:
            v: Visible batch, [batch, visible_channels, height, width].
            approach (str): Name of the reduction to apply.

        Returns:
            Tensor: [batch, n_features]; n_features depends on `approach`.
        """
        with torch.no_grad():
            h_prob, _ = self.sample_h(v)
            if approach == "global_average_pooling":
                pooled_h_prob = torch.mean(h_prob, dim=3)
                pooled_h_prob = torch.mean(pooled_h_prob, dim=2)
            elif approach == "max_pooling":
                pooled_h_prob = nn.functional.max_pool2d(h_prob, kernel_size=(2, 2), stride=(2, 2))
            elif approach == "global_max_pooling":
                pooled_h_prob = torch.max(h_prob, dim=3).values
                pooled_h_prob = torch.max(pooled_h_prob, dim=2).values
            elif approach == "avg_pool2d":
                pooled_h_prob = nn.functional.avg_pool2d(h_prob, kernel_size=(2, 2), stride=(2, 2))
            else:
                # "flatten": keep the full hidden map. Without this branch the
                # flattening would overwrite every pooled result above.
                pooled_h_prob = h_prob
        return pooled_h_prob.reshape(v.size(0), -1)

    def train_rbm(self, train_loader, val_loader, epochs=10, device='cpu', save_interval=50, save_dir='./model_weights'):
        """Train the RBM on input data using the provided dataloaders for training and testing.

        Args:
            train_loader: Loader yielding (batch, label) pairs used for updates.
            val_loader: Loader yielding (batch, label) pairs used for evaluation only.
            epochs (int): Number of passes over `train_loader`.
            device: Device the model and every batch are moved to.
            save_interval (int): Save weights and a loss plot every this many epochs.
            save_dir (str): Folder for checkpoints and plots; created if missing.
        """
        train_losses = []
        val_losses = []
        train_size = len(train_loader.dataset)
        val_size = len(val_loader.dataset)
        print(f"Training set size: {train_size}, Validation set size: {val_size}")
        # Ensure the save directory exists
        os.makedirs(save_dir, exist_ok=True)
        # Keep the parameters on the same device as the batches.
        self.to(device)

        for epoch in range(epochs):
            train_loss = 0.0
            with tqdm(train_loader, unit=" batches") as pbar:
                for batch_X, _ in pbar:
                    batch_X = batch_X.to(device)
                    train_loss += self.contrastive_divergence(batch_X)
            # Average training loss across all batches (guarded for empty loaders)
            avg_train_loss = train_loss / max(len(train_loader), 1)
            train_losses.append(avg_train_loss)

            val_loss = 0.0
            with torch.no_grad():
                for batch_X, _ in val_loader:
                    batch_X = batch_X.to(device)
                    val_loss += self.contrastive_divergence(batch_X)
            avg_val_loss = val_loss / max(len(val_loader), 1)
            val_losses.append(avg_val_loss)

            # Logging epoch summary
            print(f"Epoch [{epoch + 1}/{epochs}] - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

            # Save model weights at intervals
            if (epoch + 1) % save_interval == 0:
                save_path = os.path.join(save_dir, f'rbm_epoch_{epoch + 1}.pth')
                torch.save(self.state_dict(), save_path)
                print(f"Model weights saved to {save_path}")

                # Plot training and validation loss
                # NOTE(review): plotted at every checkpoint because the file
                # name carries the epoch number - confirm the intended cadence.
                plt.figure(figsize=(10, 6))
                plt.plot(range(epoch + 1), train_losses, label='Training Loss')
                plt.plot(range(epoch + 1), val_losses, label='Validation Loss')
                plt.title('Training and Validation Loss Over Epochs')
                plt.legend()
                # saving the plot
                plt.savefig(f'{save_dir}/loss_plot_{epoch + 1}.png')
                # Release the figure so long runs do not accumulate open figures.
                plt.close()
# Instantiating ConvRBM
# NOTE(review): the constructor arguments were truncated in the source; they
# are reconstructed from the module-level hyperparameters - confirm.
conv_rbm_64 = ConvRBM(
    visible_channels=VISIBLE_CHANNELS,
    hidden_channels=HIDDEN_CHANNELS,
    kernel_size=KERNEL_SIZE,
    cd_k=CD_K,
).to(DEVICE)

# Training ConvRBM
# NOTE(review): the training call itself was missing. 20 epochs with a
# checkpoint every 2 epochs matches the recorded training log - confirm.
conv_rbm_64.train_rbm(
    train_loader_64,
    val_loader_64,
    epochs=20,
    device=DEVICE,
    save_interval=2,
)
Training set size: 799, Validation set size: 100
Epoch [1/20] - Train Loss: 0.0398, Val Loss: 0.0338
Epoch [2/20] - Train Loss: 0.0323, Val Loss: 0.0294 Model weights saved to ./model_weights/rbm_epoch_2.pth
Epoch [3/20] - Train Loss: 0.0302, Val Loss: 0.0284
Epoch [4/20] - Train Loss: 0.0296, Val Loss: 0.0279 Model weights saved to ./model_weights/rbm_epoch_4.pth
Epoch [5/20] - Train Loss: 0.0290, Val Loss: 0.0271
Epoch [6/20] - Train Loss: 0.0280, Val Loss: 0.0260 Model weights saved to ./model_weights/rbm_epoch_6.pth
Epoch [7/20] - Train Loss: 0.0266, Val Loss: 0.0244
Epoch [8/20] - Train Loss: 0.0246, Val Loss: 0.0223 Model weights saved to ./model_weights/rbm_epoch_8.pth
Epoch [9/20] - Train Loss: 0.0219, Val Loss: 0.0196
Epoch [10/20] - Train Loss: 0.0190, Val Loss: 0.0171 Model weights saved to ./model_weights/rbm_epoch_10.pth
Epoch [11/20] - Train Loss: 0.0164, Val Loss: 0.0150
Epoch [12/20] - Train Loss: 0.0146, Val Loss: 0.0138 Model weights saved to ./model_weights/rbm_epoch_12.pth
Epoch [13/20] - Train Loss: 0.0135, Val Loss: 0.0132
Epoch [14/20] - Train Loss: 0.0131, Val Loss: 0.0130 Model weights saved to ./model_weights/rbm_epoch_14.pth
Epoch [15/20] - Train Loss: 0.0128, Val Loss: 0.0128
Epoch [16/20] - Train Loss: 0.0127, Val Loss: 0.0127 Model weights saved to ./model_weights/rbm_epoch_16.pth
Epoch [17/20] - Train Loss: 0.0127, Val Loss: 0.0127
Epoch [18/20] - Train Loss: 0.0126, Val Loss: 0.0127 Model weights saved to ./model_weights/rbm_epoch_18.pth
Epoch [19/20] - Train Loss: 0.0126, Val Loss: 0.0127
Epoch [20/20] - Train Loss: 0.0126, Val Loss: 0.0126 Model weights saved to ./model_weights/rbm_epoch_20.pth
# Feature Extraction using ConvRBM
# Load the saved model weights for epoch 8 based on the training logs
# NOTE(review): the load statement was missing from the source; the path
# follows the checkpoint naming shown in the training log - confirm.
conv_rbm_64.load_state_dict(
    torch.load('./model_weights/rbm_epoch_8.pth', map_location=DEVICE))
conv_rbm_64.to(DEVICE)

train_features = {}
test_features = {}
approaches = ["flatten", "global_average_pooling", "max_pooling", "global_max_pooling", "avg_pool2d"]

# The training loader may shuffle, which would put features in a different
# order than labels read from the dataset. Re-wrap the same subset in a
# non-shuffling loader so feature rows follow dataset order.
ordered_train_loader = DataLoader(
    train_loader_64.dataset, batch_size=train_loader_64.batch_size, shuffle=False)


def _collect_features(loader, approach):
    """Return the ConvRBM features of every sample in `loader` as one array."""
    batches = []
    with torch.no_grad():
        for batch_X, _ in loader:
            batch_X = batch_X.to(DEVICE)
            f = conv_rbm_64.transform(batch_X, approach=approach)
            batches.append(f.cpu().numpy())
    return np.concatenate(batches, axis=0)


for approach in approaches:
    print(f"Processing features for approach: {approach}")
    # Process training features
    train_features[approach] = _collect_features(ordered_train_loader, approach)
    # Process testing features
    test_features[approach] = _collect_features(test_loader_64, approach)
Processing features for approach: flatten Processing features for approach: global_average_pooling Processing features for approach: max_pooling Processing features for approach: global_max_pooling Processing features for approach: avg_pool2d
# Extract labels for the train and test datasets
# Labels are read by walking each subset in index order, so any feature matrix
# paired with them must be built in that same (unshuffled) order.
train_labels = np.array([int(label) for _, label in train_loader_64.dataset])
test_labels = np.array([int(label) for _, label in test_loader_64.dataset])
# random forest classifier
# Fit one forest per feature-extraction approach and report its test accuracy.
# NOTE(review): the printed message says "Decision Tree" although the model is
# a random forest; the text is kept unchanged so it matches the recorded output.
for approach in train_features:
    dt_clf = RandomForestClassifier(max_depth=15, random_state=13)
    dt_clf.fit(train_features[approach], train_labels)
    test_predictions = dt_clf.predict(test_features[approach])
    accuracy = accuracy_score(test_labels, test_predictions)
    print(f"Decision Tree Classifier Accuracy using {approach} pooling: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(test_labels, test_predictions))
Decision Tree Classifier Accuracy using flatten pooling: 0.5200 Classification Report: precision recall f1-score support 0 0.56 0.62 0.59 8 1 0.44 0.80 0.57 5 2 0.71 0.45 0.56 11 3 0.33 0.50 0.40 6 4 0.56 0.45 0.50 11 5 0.64 0.50 0.56 14 6 0.87 0.76 0.81 17 7 0.33 0.67 0.44 6 8 0.33 0.33 0.33 12 9 0.29 0.20 0.24 10 accuracy 0.52 100 macro avg 0.51 0.53 0.50 100 weighted avg 0.55 0.52 0.52 100 Decision Tree Classifier Accuracy using global_average_pooling pooling: 0.2100 Classification Report: precision recall f1-score support 0 0.08 0.12 0.10 8 1 0.22 0.40 0.29 5 2 0.00 0.00 0.00 11 3 0.08 0.17 0.11 6 4 0.18 0.18 0.18 11 5 0.27 0.21 0.24 14 6 0.70 0.41 0.52 17 7 0.29 0.33 0.31 6 8 0.23 0.25 0.24 12 9 0.00 0.00 0.00 10 accuracy 0.21 100 macro avg 0.21 0.21 0.20 100 weighted avg 0.24 0.21 0.22 100 Decision Tree Classifier Accuracy using max_pooling pooling: 0.4900 Classification Report: precision recall f1-score support 0 0.56 0.62 0.59 8 1 0.44 0.80 0.57 5 2 0.57 0.36 0.44 11 3 0.09 0.17 0.12 6 4 0.67 0.36 0.47 11 5 0.88 0.50 0.64 14 6 0.72 0.76 0.74 17 7 0.33 0.50 0.40 6 8 0.36 0.42 0.38 12 9 0.33 0.30 0.32 10 accuracy 0.49 100 macro avg 0.50 0.48 0.47 100 weighted avg 0.55 0.49 0.50 100 Decision Tree Classifier Accuracy using global_max_pooling pooling: 0.3400 Classification Report: precision recall f1-score support 0 0.27 0.38 0.32 8 1 0.25 0.60 0.35 5 2 0.25 0.18 0.21 11 3 0.11 0.17 0.13 6 4 0.25 0.09 0.13 11 5 0.62 0.57 0.59 14 6 0.67 0.35 0.46 17 7 0.12 0.17 0.14 6 8 0.36 0.42 0.38 12 9 0.33 0.40 0.36 10 accuracy 0.34 100 macro avg 0.32 0.33 0.31 100 weighted avg 0.38 0.34 0.34 100 Decision Tree Classifier Accuracy using avg_pool2d pooling: 0.5300 Classification Report: precision recall f1-score support 0 0.56 0.62 0.59 8 1 0.42 1.00 0.59 5 2 0.75 0.27 0.40 11 3 0.23 0.50 0.32 6 4 0.80 0.73 0.76 11 5 0.71 0.36 0.48 14 6 0.81 0.76 0.79 17 7 0.40 0.67 0.50 6 8 0.42 0.42 0.42 12 9 0.29 0.20 0.24 10 accuracy 0.53 100 macro avg 0.54 0.55 0.51 100 weighted avg 0.59 
0.53 0.53 100
# t-SNE visualization
from sklearn.manifold import TSNE
import seaborn as sns
import pandas as pd
def plot_tsne(features, labels):
    """Draw a 2-component t-SNE embedding of `features`, coloured by `labels`.

    Args:
        features: Feature matrix handed to TSNE.fit_transform.
        labels: One label per row of `features`; used as the point hue.
    """
    embedding = TSNE(n_components=2, random_state=13).fit_transform(features)

    # Long-form frame so seaborn can map the label column to colour.
    frame = pd.DataFrame(embedding, columns=['Component 1', 'Component 2'])
    frame['Label'] = labels

    plt.figure(figsize=(12, 8))
    sns.scatterplot(
        data=frame,
        x='Component 1',
        y='Component 2',
        hue='Label',
        palette='tab10',
        alpha=0.6,
    )
    plt.title('t-SNE Visualization of Audio Features')
# Plot the 2-component t-SNE embedding of the training features produced by
# the "avg_pool2d" approach, coloured by training label.
plot_tsne(train_features["avg_pool2d"], train_labels)