add similarity functions

This commit is contained in:
Nikola Petrov 2025-01-29 23:08:47 +01:00
parent 2c9e13baaf
commit 609958bc69
2 changed files with 130 additions and 0 deletions

View File

@ -0,0 +1,16 @@
#include "Dna.hpp"
#include <vector>
// Pairwise similarity measures computed over the raw bytes of two Dna objects,
// plus a helper that averages one of those measures over a whole collection.
namespace Similarity
{
// float euclidean_distance(Dna *d1, Dna *d2); // direct distance between the two vectors; not used because the result is not bounded to [0, 1]
// float dot_product(Dna *d1, Dna *d2); // not used because the result is not bounded to [0, 1]
// Cosine of the angle between d1 and d2, each viewed as a vector of sizeof(Dna) bytes.
float cosine_similarity(Dna *d1, Dna *d2);
// Despite the name, returns a similarity: 1 - (number of differing byte positions / sizeof(Dna)).
float hamming_distance(Dna *d1, Dna *d2);
// Ratio intersection / union computed over the byte values of d1 and d2.
float jaccard_index(Dna *d1, Dna *d2);
// Despite the name, returns a similarity: 1 - (byte-wise edit distance / sizeof(Dna)).
float levenshtein_distance(Dna *d1, Dna *d2);
// float needleman_wunsch(Dna *d1, Dna *d2); // used in bioinformatics for sequence alignment; not needed because the data is already aligned
// Signature shared by the similarity measures above; callback type accepted by calc_similarity.
typedef float(simil_func)(Dna *d1, Dna *d2);
// Applies f to every unordered pair of elements in vec and returns the mean of the results.
float calc_similarity(std::vector<Dna> vec, simil_func f);
}

View File

@ -0,0 +1,114 @@
#include "values/Similarity.hpp"
#include <cmath>
// NOTE(review): none of the functions shown in this translation unit reference
// these three macros. Presumably they are match / mismatch / gap scores for a
// sequence-alignment routine (Needleman-Wunsch is commented out in the header);
// confirm before relying on or removing them.
#define MATCH 1
#define MISMATCH -1
#define GAP -2
namespace Similarity
{
// Cosine similarity between two Dna objects, each viewed as a vector of
// sizeof(Dna) unsigned bytes.
//
// Returns dot(d1, d2) / (|d1| * |d2|). Every component is non-negative, so the
// result lies in [0, 1].
//
// Zero-magnitude input (all bytes are 0) makes the quotient undefined. Instead
// of returning NaN, which would poison any average built on top of this
// function, the result is 1 when both objects are all-zero (they are
// identical) and 0 when only one of them is.
//
// NOTE(review): the whole object representation is read, so this assumes Dna
// contains no padding bytes - confirm against the Dna definition.
float cosine_similarity(Dna *d1, Dna *d2)
{
    const uint8_t *lhs = reinterpret_cast<const uint8_t *>(d1);
    const uint8_t *rhs = reinterpret_cast<const uint8_t *>(d2);

    // Accumulate in double: the sums are integers that can exceed the 2^24
    // range in which float represents every integer exactly.
    double dot_prod = 0.0;
    double sq_mag1 = 0.0;
    double sq_mag2 = 0.0;
    for (size_t i = 0; i < sizeof(Dna); i++)
    {
        const double a = lhs[i];
        const double b = rhs[i];
        dot_prod += a * b;
        sq_mag1 += a * a;
        sq_mag2 += b * b;
    }

    // Guard the division: a zero vector has no direction.
    if (sq_mag1 == 0.0 || sq_mag2 == 0.0)
    {
        return (sq_mag1 == sq_mag2) ? 1.0f : 0.0f;
    }
    return static_cast<float>(dot_prod / (std::sqrt(sq_mag1) * std::sqrt(sq_mag2)));
}
// Hamming-based similarity of two Dna objects compared byte by byte.
//
// Counts the byte positions at which d1 and d2 differ and returns
// 1 - (mismatches / sizeof(Dna)), so identical objects give 1 and objects
// that differ in every byte give 0. Note that, despite the name, the value
// returned is a similarity, not a distance.
float hamming_distance(Dna *d1, Dna *d2)
{
    const uint8_t *lhs = (const uint8_t *)d1;
    const uint8_t *rhs = (const uint8_t *)d2;
    const uint8_t *const lhs_end = lhs + sizeof(Dna);

    float mismatches = 0;
    for (; lhs != lhs_end; ++lhs, ++rhs)
    {
        // Adds one for a differing byte, nothing otherwise.
        mismatches += (*lhs != *rhs) ? 1.0f : 0.0f;
    }

    const float mismatch_ratio = mismatches / sizeof(Dna);
    return 1 - mismatch_ratio;
}
// Jaccard index of two Dna objects, each treated as a multiset (bag) of
// sizeof(Dna) byte values.
//
// intersection = sum over byte values v of min(count1[v], count2[v])
// union        = |d1| + |d2| - intersection
// Returns intersection / union, which is 1 for objects holding the same bytes
// (in any order) and 0 for objects sharing no byte value.
//
// The counts are taken from per-value histograms, so the result is symmetric
// in its arguments and a byte of one object is never matched against more
// occurrences than the other object actually has. The cost is linear in
// sizeof(Dna) rather than quadratic.
float jaccard_index(Dna *d1, Dna *d2)
{
    const uint8_t *lhs = reinterpret_cast<const uint8_t *>(d1);
    const uint8_t *rhs = reinterpret_cast<const uint8_t *>(d2);

    // One bucket per possible uint8_t value.
    size_t hist1[256] = {};
    size_t hist2[256] = {};
    for (size_t i = 0; i < sizeof(Dna); i++)
    {
        hist1[lhs[i]]++;
        hist2[rhs[i]]++;
    }

    size_t intersection = 0;
    for (size_t value = 0; value < 256; value++)
    {
        intersection += (hist1[value] < hist2[value]) ? hist1[value] : hist2[value];
    }

    // intersection <= sizeof(Dna), so union_size >= sizeof(Dna) > 0.
    const size_t union_size = sizeof(Dna) + sizeof(Dna) - intersection;
    return static_cast<float>(intersection) / union_size;
}
// Levenshtein-based similarity of two Dna objects, each treated as a sequence
// of sizeof(Dna) bytes.
//
// Computes the edit distance (unit-cost insertion, deletion and substitution)
// between the two byte sequences and returns 1 - (distance / sizeof(Dna)).
// Both sequences have the same length, so the distance never exceeds
// sizeof(Dna) and the result lies in [0, 1]. Despite the name, the value
// returned is a similarity, not a distance.
//
// Distances are kept as size_t so they are not truncated once they exceed
// 255, and only two rows of the dynamic-programming table are stored, keeping
// stack usage linear in sizeof(Dna) instead of quadratic.
float levenshtein_distance(Dna *d1, Dna *d2)
{
    const size_t len = sizeof(Dna);
    const uint8_t *seq1 = reinterpret_cast<const uint8_t *>(d1);
    const uint8_t *seq2 = reinterpret_cast<const uint8_t *>(d2);

    size_t row_a[sizeof(Dna) + 1];
    size_t row_b[sizeof(Dna) + 1];
    size_t *prev = row_a; // distances for the prefix seq1[0, i-1)
    size_t *curr = row_b; // distances for the prefix seq1[0, i)

    // Transforming an empty prefix into seq2[0, j) takes j insertions.
    for (size_t j = 0; j <= len; j++)
    {
        prev[j] = j;
    }

    for (size_t i = 1; i <= len; i++)
    {
        // Transforming seq1[0, i) into an empty prefix takes i deletions.
        curr[0] = i;
        for (size_t j = 1; j <= len; j++)
        {
            const size_t cost = (seq1[i - 1] == seq2[j - 1]) ? 0 : 1;
            const size_t deletion = prev[j] + 1;
            const size_t insertion = curr[j - 1] + 1;
            const size_t substitution = prev[j - 1] + cost;

            size_t best = (deletion < insertion) ? deletion : insertion;
            if (substitution < best)
            {
                best = substitution;
            }
            curr[j] = best;
        }
        // The row just filled becomes the previous row of the next iteration.
        size_t *tmp = prev;
        prev = curr;
        curr = tmp;
    }

    // After the final swap the last computed row is in prev.
    const float ld = static_cast<float>(prev[len]);
    return 1 - (ld / sizeof(Dna));
}
float calc_similarity(std::vector<Dna> vec, simil_func f)
{
size_t num_pairs = (vec.size() * (vec.size() - 1)) / 2;
float total_similarity = 0.0;
for (size_t i = 0; i < vec.size(); i++)
{
for (size_t j = i + 1; j < vec.size(); j++)
{
total_similarity += f(&vec[i], &vec[j]);
}
}
float average_similarity = total_similarity / num_pairs;
return average_similarity;
}
}