Update similarity calc

This commit is contained in:
2025-03-09 15:14:21 +01:00
parent 1994e7ccb1
commit 266d250881
5 changed files with 73 additions and 56 deletions

View File

@@ -5,13 +5,14 @@ namespace Similarity
{
// float euclidean_distance(Dna *d1, Dna *d2); direct distance between the vectors; the result is not limited to the range 0 to 1
// float dot_product(Dna *d1, Dna *d2); doesn't return a value between 0 and 1
float cosine_similarity(Dna *d1, Dna *d2);
float cosine_similarity_int(Dna *d1, Dna *d2);
// float cosine_similarity(Dna *d1, Dna *d2);
// float cosine_similarity_int(Dna *d1, Dna *d2);
float hamming_distance(Dna *d1, Dna *d2);
// Byte-wise similarity of d1 and d2 that skips the first 3 * sizeof(uint128)
// bytes of Dna: returns 1 minus the fraction of compared bytes that differ
// (1 = all compared bytes equal, 0 = all compared bytes differ).
float hamming_distance_without_seeds(Dna *d1, Dna *d2);
// float jaccard_index(Dna *d1, Dna *d2); // compares the union of genes, e.g. it checks whether the gene for the sky is the same as the gene for the leaf color; we don't need that
// float levenshtein_distance(Dna *d1, Dna *d2); // removed because it returns the same data as the Hamming distance, which is simpler to compute
// float needleman_wunsch(Dna *d1, Dna *d2); used in bioinformatics for alignment. Not needed: the data is already aligned
typedef float(simil_func)(Dna *d1, Dna *d2);
float calc_similarity(std::vector<Dna> &vec, simil_func f);
float calc_similarity(std::vector<Dna> &vec, simil_func f = hamming_distance_without_seeds);
}

View File

@@ -7,50 +7,50 @@ namespace Similarity
// 0 -> -128
// 255 -> 127
// int8_t = uint8_t - 128
// Cosine similarity of d1 and d2, each read as sizeof(Dna) raw unsigned bytes:
// the dot product of the two byte sequences divided by the product of their
// Euclidean magnitudes.
// The final division is not guarded against a zero magnitude.
// NOTE(review): every live line below is followed by a commented-out copy of
// itself; this looks like a diff of the function being commented out - confirm
// which version is current.
float cosine_similarity(Dna *d1, Dna *d2)
{
// View both objects as plain byte arrays.
uint8_t *d1a = (uint8_t *)d1;
uint8_t *d2a = (uint8_t *)d2;
// float cosine_similarity(Dna *d1, Dna *d2)
// {
// uint8_t *d1a = (uint8_t *)d1;
// uint8_t *d2a = (uint8_t *)d2;
float mag1 = 0.0f;
float mag2 = 0.0f;
float dot_prod = 0.0f;
// Accumulate the dot product and both sums of squares in a single pass.
for (size_t i = 0; i < sizeof(Dna); i++)
{
dot_prod += d1a[i] * d2a[i];
mag1 += d1a[i] * d1a[i];
mag2 += d2a[i] * d2a[i];
}
// Turn the sums of squares into Euclidean magnitudes.
mag1 = sqrt(mag1);
mag2 = sqrt(mag2);
// float mag1 = 0.0f;
// float mag2 = 0.0f;
// float dot_prod = 0.0f;
// for (size_t i = 0; i < sizeof(Dna); i++)
// {
// dot_prod += d1a[i] * d2a[i];
// mag1 += d1a[i] * d1a[i];
// mag2 += d2a[i] * d2a[i];
// }
// mag1 = sqrt(mag1);
// mag2 = sqrt(mag2);
return dot_prod / (mag1 * mag2);
}
// return dot_prod / (mag1 * mag2);
// }
// Cosine similarity of d1 and d2 over sizeof(Dna) bytes, where each byte is
// first shifted by -128 into the signed range [-128, 127] before the dot
// product and the magnitudes are accumulated.
// The final division is not guarded against a zero magnitude.
// NOTE(review): every live line below is followed by a commented-out copy of
// itself; this looks like a diff of the function being commented out - confirm
// which version is current.
float cosine_similarity_int(Dna *d1, Dna *d2)
{
// Re-centre an unsigned byte: 0 -> -128, 255 -> 127.
auto map = [](uint8_t a) -> int8_t
{ return a - 128; };
// float cosine_similarity_int(Dna *d1, Dna *d2)
// {
// auto map = [](uint8_t a) -> int8_t
// { return a - 128; };
// View both objects as plain byte arrays.
uint8_t *d1a = (uint8_t *)d1;
uint8_t *d2a = (uint8_t *)d2;
// uint8_t *d1a = (uint8_t *)d1;
// uint8_t *d2a = (uint8_t *)d2;
float mag1 = 0.0f;
float mag2 = 0.0f;
float dot_prod = 0.0f;
// Accumulate the dot product and both sums of squares of the shifted bytes.
for (size_t i = 0; i < sizeof(Dna); i++)
{
int8_t a = map(d1a[i]);
int8_t b = map(d2a[i]);
dot_prod += a * b;
mag1 += a * a;
mag2 += b * b;
}
// Turn the sums of squares into Euclidean magnitudes.
mag1 = sqrt(mag1);
mag2 = sqrt(mag2);
// float mag1 = 0.0f;
// float mag2 = 0.0f;
// float dot_prod = 0.0f;
// for (size_t i = 0; i < sizeof(Dna); i++)
// {
// int8_t a = map(d1a[i]);
// int8_t b = map(d2a[i]);
// dot_prod += a * b;
// mag1 += a * a;
// mag2 += b * b;
// }
// mag1 = sqrt(mag1);
// mag2 = sqrt(mag2);
return dot_prod / (mag1 * mag2);
}
// return dot_prod / (mag1 * mag2);
// }
float hamming_distance(Dna *d1, Dna *d2)
{
@@ -67,6 +67,23 @@ namespace Similarity
return 1 - (distance / sizeof(Dna));
}
// Byte-wise similarity of two Dna objects that ignores the leading
// 3 * sizeof(uint128) bytes of each object.
// NOTE(review): going by the function name, the skipped prefix presumably
// holds the seeds - confirm against the Dna layout.
//
// d1, d2: the objects to compare; both are only read.
// Returns 1 minus the fraction of compared bytes that differ, i.e. 1 when all
// compared bytes are equal and 0 when all of them differ.
float hamming_distance_without_seeds(Dna *d1, Dna *d2)
{
    constexpr size_t first_byte = sizeof(uint128) * 3; // offset where the comparison starts
    constexpr size_t past_last = sizeof(Dna);
    constexpr size_t compared_bytes = past_last - first_byte;

    // Read both objects through byte pointers.
    const uint8_t *lhs = reinterpret_cast<const uint8_t *>(d1);
    const uint8_t *rhs = reinterpret_cast<const uint8_t *>(d2);

    // Count differing bytes; kept as a float so the final division is a float division.
    float mismatches = 0.0f;
    for (size_t pos = first_byte; pos < past_last; ++pos)
    {
        mismatches += (lhs[pos] != rhs[pos]) ? 1.0f : 0.0f;
    }

    const float differing_fraction = mismatches / static_cast<float>(compared_bytes);
    return 1.0f - differing_fraction;
}
float calc_similarity(std::vector<Dna> &vec, simil_func f)
{
size_t num_pairs = (vec.size() * (vec.size() - 1)) / 2;