add similarity functions
This commit is contained in:
parent
2c9e13baaf
commit
609958bc69
16
shared/inc/values/Similarity.hpp
Normal file
16
shared/inc/values/Similarity.hpp
Normal file
@ -0,0 +1,16 @@
|
||||
#ifndef VALUES_SIMILARITY_HPP
#define VALUES_SIMILARITY_HPP

#include "Dna.hpp"
#include <vector>

/// Pairwise similarity measures between two Dna objects, plus a helper that
/// averages one of those measures over every pair in a collection.
///
/// Each measure is intended to return a score in [0, 1], where 1 means the
/// two objects are identical.
namespace Similarity
{
    // Measures that were considered but are deliberately not provided:
    //  - euclidean_distance(Dna *d1, Dna *d2): direct distance between the
    //    vectors; the result is not confined to the range 0..1.
    //  - dot_product(Dna *d1, Dna *d2): the result is not confined to 0..1.
    //  - needleman_wunsch(Dna *d1, Dna *d2): an alignment algorithm from
    //    bioinformatics; not needed because the data is already aligned.

    /// Cosine of the angle between the two objects viewed as byte vectors.
    float cosine_similarity(Dna *d1, Dna *d2);

    /// Similarity derived from the Hamming distance: 1 minus the fraction of
    /// byte positions at which the two objects differ.
    float hamming_distance(Dna *d1, Dna *d2);

    /// Jaccard index computed over the byte values held by the two objects.
    float jaccard_index(Dna *d1, Dna *d2);

    /// Similarity derived from the Levenshtein (edit) distance between the
    /// two byte sequences: 1 minus the distance divided by sizeof(Dna).
    float levenshtein_distance(Dna *d1, Dna *d2);

    /// Signature shared by every pairwise measure above.
    using simil_func = float(Dna *d1, Dna *d2);

    /// Mean of `f` over every unordered pair of elements in `vec`.
    float calc_similarity(std::vector<Dna> vec, simil_func f);
}

#endif // VALUES_SIMILARITY_HPP
|
114
shared/src/values/Similarity.cpp
Normal file
114
shared/src/values/Similarity.cpp
Normal file
@ -0,0 +1,114 @@
|
||||
#include "values/Similarity.hpp"
|
||||
#include <cmath>
|
||||
|
||||
// Alignment scoring weights (match reward, mismatch penalty, gap penalty).
// NOTE(review): no function in this file references them; presumably left
// over from the needleman_wunsch experiment - confirm before removing.
#define MATCH (1)
#define MISMATCH (-1)
#define GAP (-2)
|
||||
|
||||
namespace Similarity
|
||||
{
|
||||
|
||||
/// Cosine similarity between two Dna objects, each viewed as a vector of
/// sizeof(Dna) unsigned bytes.
///
/// @param d1 first object; its object representation is read byte by byte
/// @param d2 second object, read the same way
/// @return dot(d1, d2) / (|d1| * |d2|), which lies in [0, 1] because every
///         component is non-negative. When a vector has zero magnitude the
///         quotient is undefined; in that case 1 is returned if both vectors
///         are all-zero (they are identical) and 0 otherwise.
///
/// NOTE(review): reading the raw bytes also reads any padding inside Dna -
/// confirm Dna has no padding, or that it is always zero-initialised.
float cosine_similarity(Dna *d1, Dna *d2)
{
    const uint8_t *lhs = reinterpret_cast<const uint8_t *>(d1);
    const uint8_t *rhs = reinterpret_cast<const uint8_t *>(d2);

    // Accumulate in integers: the sums stay exact for any realistic size,
    // whereas a float accumulator starts rounding once it passes 2^24.
    unsigned long long dot_prod = 0;
    unsigned long long sq_mag1 = 0;
    unsigned long long sq_mag2 = 0;
    for (size_t i = 0; i < sizeof(Dna); i++)
    {
        const unsigned long long a = lhs[i];
        const unsigned long long b = rhs[i];
        dot_prod += a * b;
        sq_mag1 += a * a;
        sq_mag2 += b * b;
    }

    // A zero-magnitude vector would make the division below 0/0 or x/0.
    if (sq_mag1 == 0 || sq_mag2 == 0)
    {
        return (sq_mag1 == sq_mag2) ? 1.0f : 0.0f;
    }

    const double denominator =
        std::sqrt(static_cast<double>(sq_mag1)) * std::sqrt(static_cast<double>(sq_mag2));
    const double cosine = static_cast<double>(dot_prod) / denominator;

    // Rounding in the square roots can push the quotient a hair above 1.
    return cosine > 1.0 ? 1.0f : static_cast<float>(cosine);
}
|
||||
|
||||
/// Hamming-based similarity of two Dna objects compared byte by byte.
///
/// @param d1 first object; its object representation is read as bytes
/// @param d2 second object, read the same way
/// @return 1 minus the fraction of the sizeof(Dna) byte positions whose
///         values differ, so 1 means every byte matches and 0 means none do.
float hamming_distance(Dna *d1, Dna *d2)
{
    const uint8_t *lhs = reinterpret_cast<const uint8_t *>(d1);
    const uint8_t *rhs = reinterpret_cast<const uint8_t *>(d2);
    const uint8_t *const lhs_end = lhs + sizeof(Dna);

    // Number of positions at which the two byte sequences disagree.
    float mismatches = 0;
    for (; lhs != lhs_end; ++lhs, ++rhs)
    {
        if (*lhs != *rhs)
        {
            mismatches += 1.0f;
        }
    }

    const float mismatch_fraction = mismatches / sizeof(Dna);
    return 1 - mismatch_fraction;
}
|
||||
|
||||
/// Jaccard index of two Dna objects, each treated as a multiset (bag) of the
/// sizeof(Dna) byte values in its object representation.
///
/// The intersection size is the sum over all 256 byte values of the smaller
/// of the two occurrence counts; the union size is |d1| + |d2| minus that.
/// The previous nested scan counted every byte of d1 that occurred anywhere
/// in d2, so a single byte of d2 could be matched by many duplicates in d1
/// and the result depended on the order of the arguments.
///
/// @param d1 first object, read as bytes
/// @param d2 second object, read as bytes
/// @return |intersection| / |union| in [0, 1]; symmetric in its arguments
///         and equal to 1 when both objects hold the same bytes.
float jaccard_index(Dna *d1, Dna *d2)
{
    const uint8_t *lhs = reinterpret_cast<const uint8_t *>(d1);
    const uint8_t *rhs = reinterpret_cast<const uint8_t *>(d2);

    // Occurrence count of each possible byte value on either side.
    size_t lhs_count[256] = {};
    size_t rhs_count[256] = {};
    for (size_t i = 0; i < sizeof(Dna); i++)
    {
        lhs_count[lhs[i]]++;
        rhs_count[rhs[i]]++;
    }

    size_t intersection = 0;
    for (size_t value = 0; value < 256; value++)
    {
        intersection += (lhs_count[value] < rhs_count[value]) ? lhs_count[value]
                                                              : rhs_count[value];
    }

    // intersection <= sizeof(Dna), so the union is never smaller than sizeof(Dna) >= 1.
    const size_t union_size = sizeof(Dna) + sizeof(Dna) - intersection;
    return static_cast<float>(intersection) / union_size;
}
|
||||
|
||||
/// Levenshtein-based similarity of two Dna objects, each read as a sequence
/// of sizeof(Dna) bytes.
///
/// The edit distance (unit cost for insertion, deletion and substitution) is
/// computed with the usual dynamic programme, keeping only two rows of the
/// table. Costs are held in size_t: the earlier version passed them through
/// a helper that took and returned uint8_t, which truncated every cell to
/// 8 bits and broke the result once sizeof(Dna) reached 255. It also kept a
/// full (n+1) x (n+1) float matrix on the stack.
///
/// @param d1 first object, read as bytes
/// @param d2 second object, read as bytes
/// @return 1 - distance / sizeof(Dna). Both sequences have the same length,
///         so the distance is at most sizeof(Dna) and the result is in [0, 1].
float levenshtein_distance(Dna *d1, Dna *d2)
{
    const uint8_t *lhs = reinterpret_cast<const uint8_t *>(d1);
    const uint8_t *rhs = reinterpret_cast<const uint8_t *>(d2);
    constexpr size_t length = sizeof(Dna);

    // previous[j]: distance between the first i-1 bytes of lhs and the first
    // j bytes of rhs; current[j] is the same for the first i bytes of lhs.
    std::vector<size_t> previous(length + 1);
    std::vector<size_t> current(length + 1);
    for (size_t j = 0; j <= length; j++)
    {
        previous[j] = j; // turning an empty prefix into j bytes takes j insertions
    }

    for (size_t i = 1; i <= length; i++)
    {
        current[0] = i; // turning i bytes into an empty prefix takes i deletions
        for (size_t j = 1; j <= length; j++)
        {
            const size_t substitution_cost = (lhs[i - 1] == rhs[j - 1]) ? 0 : 1;
            const size_t by_deletion = previous[j] + 1;
            const size_t by_insertion = current[j - 1] + 1;
            const size_t by_substitution = previous[j - 1] + substitution_cost;

            size_t best = (by_deletion < by_insertion) ? by_deletion : by_insertion;
            if (by_substitution < best)
            {
                best = by_substitution;
            }
            current[j] = best;
        }
        previous.swap(current); // the row just filled becomes the previous row
    }

    const float distance = static_cast<float>(previous[length]);
    return 1 - (distance / length);
}
|
||||
|
||||
float calc_similarity(std::vector<Dna> vec, simil_func f)
|
||||
{
|
||||
size_t num_pairs = (vec.size() * (vec.size() - 1)) / 2;
|
||||
|
||||
float total_similarity = 0.0;
|
||||
for (size_t i = 0; i < vec.size(); i++)
|
||||
{
|
||||
for (size_t j = i + 1; j < vec.size(); j++)
|
||||
{
|
||||
total_similarity += f(&vec[i], &vec[j]);
|
||||
}
|
||||
}
|
||||
float average_similarity = total_similarity / num_pairs;
|
||||
return average_similarity;
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user