I have about 30 sound clips that are each a preset from a synthesizer. I want to
compare these sounds to find out which ones are similar, and then sort the sounds so that each sound is adjacent in a list to two sounds which are similar to it. Frequency is not the only thing I want to look for: I would rather two saw waves which are a tone apart be considered similar than a saw wave and a sine wave which are the same note.
For example, these sounds would be considered similar:

Below is some code I have written in order to separate a song clip into individual sounds. At the end I have an array named transientSamples which holds the ~30 sounds from the sound file. I would like to sort this list by sound similarity.
Here is the sound file I am using
import librosa
import numpy as np
import os
import soundfile as sf
import pretty_midi
def transients_from_onsets(onset_samples):
    """Takes a list of onset times for an audio file and returns the list of start and stop times for that audio file

    Each onset is paired with the onset that follows it, so N onsets
    produce N - 1 (start, stop) pairs; fewer than two onsets produce an
    empty list.

    Args:
        onset_samples ([int]): Onset positions, in the order they occur

    Returns:
        [(int, int)]: A list of start and stop times for each sound change
    """
    # Zipping the sequence against itself shifted by one pairs every onset
    # with its successor without manual index bookkeeping.
    return list(zip(onset_samples[:-1], onset_samples[1:]))
def transient_samples_from_times(transientTimes, y):
    """Cuts a signal into the segments described by (start, stop) index pairs

    Args:
        transientTimes ([(int, int)]): Start and stop indices of each segment
        y (sequence): The signal to cut up; anything that supports slicing

    Returns:
        list: One slice ``y[start:stop]`` per entry in transientTimes
    """
    # Slice with start:stop. The previous y[start, stop] was a two-axis
    # index, which fails on a one-dimensional signal.
    return [y[start:stop] for (start, stop) in transientTimes]
def transients_from_sound_file(fileName, sr=44100):
    """Takes the path to an audio file and splits it at each detected onset

    Args:
        fileName (string): The path to an audio file
        sr (int, optional): The sample rate passed to librosa.load. Defaults to 44100.

    Returns:
        ([(int, int)], list): The start and stop sample index of each sound
        change, and the matching slices of the loaded signal
    """
    # Load from the fileName parameter (the original read an undefined name).
    y, sr = librosa.load(fileName, sr=sr)

    # Onset strength is computed from the dB-scaled constant-Q magnitude.
    C = np.abs(librosa.cqt(y=y, sr=sr))
    o_env = librosa.onset.onset_strength(sr=sr, S=librosa.amplitude_to_db(C, ref=np.max))
    onset_frames = librosa.onset.onset_detect(onset_envelope=o_env, sr=sr)
    onset_samples = librosa.frames_to_samples(onset_frames)

    # Append the signal length so the final onset also gets a stop index.
    # np.append takes a value; np.concatenate would read len(y) as an axis.
    onset_samples = np.append(onset_samples, len(y))

    transientTimes = transients_from_onsets(onset_samples)
    transientSamples = transient_samples_from_times(transientTimes, y)
    return transientTimes, transientSamples
def main():
    """Splits the example clip into its individual sounds"""
    soundFile = "first-four-seconds.wav"
    result = transients_from_sound_file(soundFile)
    # result is the (times, samples) pair returned by the helper.
    transientTimes, transientSamples = result