From 609958bc6910d01355eb14f3b948bc20ddc2860e Mon Sep 17 00:00:00 2001 From: Nikola Petrov Date: Wed, 29 Jan 2025 23:08:47 +0100 Subject: [PATCH] add similarity functions --- shared/inc/values/Similarity.hpp | 16 +++++ shared/src/values/Similarity.cpp | 114 +++++++++++++++++++++++++++++++ 2 files changed, 130 insertions(+) create mode 100644 shared/inc/values/Similarity.hpp create mode 100644 shared/src/values/Similarity.cpp diff --git a/shared/inc/values/Similarity.hpp b/shared/inc/values/Similarity.hpp new file mode 100644 index 0000000..f257082 --- /dev/null +++ b/shared/inc/values/Similarity.hpp @@ -0,0 +1,16 @@ +#include "Dna.hpp" +#include + +namespace Similarity +{ + // float euclidean_distance(Dna *d1, Dna *d2); direct distance betwen vector. wont give 0 and 1 + // float dot_product(Dna *d1, Dna *d2); doent return betwen 0 to 1 + float cosine_similarity(Dna *d1, Dna *d2); + float hamming_distance(Dna *d1, Dna *d2); + float jaccard_index(Dna *d1, Dna *d2); + float levenshtein_distance(Dna *d1, Dna *d2); + // float needleman_wunsch(Dna *d1, Dna *d2); used for bioinformatics and aligment. Dont need its aligned alredy + + typedef float(simil_func)(Dna *d1, Dna *d2); + float calc_similarity(std::vector vec, simil_func f); +} \ No newline at end of file diff --git a/shared/src/values/Similarity.cpp b/shared/src/values/Similarity.cpp new file mode 100644 index 0000000..197f6ea --- /dev/null +++ b/shared/src/values/Similarity.cpp @@ -0,0 +1,114 @@ +#include "values/Similarity.hpp" +#include + +#define MATCH 1 +#define MISMATCH -1 +#define GAP -2 + +namespace Similarity +{ + + float cosine_similarity(Dna *d1, Dna *d2) + { + uint8_t *d1a = (uint8_t *)d1; + uint8_t *d2a = (uint8_t *)d2; + + float mag1 = 0.0f; + float mag2 = 0.0f; + float dot_prod = 0.0f; + for (size_t i = 0; i < sizeof(Dna); i++) + { + dot_prod += d1a[i] * d2a[i]; + mag1 += d1a[i] * d1a[i]; + mag2 += d2a[i] * d2a[i]; + } + mag1 = sqrt(mag1); + mag2 = sqrt(mag2); + + return dot_prod / (mag1 * mag2); + } + + float hamming_distance(Dna *d1, Dna *d2) + { + uint8_t *d1a = (uint8_t *)d1; + uint8_t *d2a = (uint8_t *)d2; + float distance = 0; + for (size_t i = 0; i < sizeof(Dna); i++) + { + if (d1a[i] != d2a[i]) + { + distance++; + } + } + return 1 - (distance / sizeof(Dna)); + } + + float jaccard_index(Dna *d1, Dna *d2) + { + uint8_t *d1a = (uint8_t *)d1; + uint8_t *d2a = (uint8_t *)d2; + size_t intersection = 0; + size_t union_size = sizeof(Dna) + sizeof(Dna); + + for (size_t i = 0; i < sizeof(Dna); i++) + { + for (size_t j = 0; j < sizeof(Dna); j++) + { + if (d1a[i] == d2a[j]) + { + intersection++; + break; + } + } + } + + union_size -= intersection; + return (float)intersection / union_size; + } + + float levenshtein_distance(Dna *d1, Dna *d2) + { + auto min = [](uint8_t a, uint8_t b, uint8_t c) -> uint8_t + { + return (a < b ? (a < c ? a : c) : (b < c ? b : c)); + }; + + uint8_t *d1a = (uint8_t *)d1; + uint8_t *d2a = (uint8_t *)d2; + float matrix[sizeof(Dna) + 1][sizeof(Dna) + 1]; + for (size_t i = 0; i <= sizeof(Dna); i++) + { + matrix[i][0] = i; + } + for (size_t j = 0; j <= sizeof(Dna); j++) + { + matrix[0][j] = j; + } + for (size_t i = 1; i <= sizeof(Dna); i++) + { + for (size_t j = 1; j <= sizeof(Dna); j++) + { + uint8_t cost = (d1a[i - 1] == d2a[j - 1]) ? 0 : 1; + matrix[i][j] = min(matrix[i - 1][j] + 1, matrix[i][j - 1] + 1, matrix[i - 1][j - 1] + cost); + } + } + float ld = matrix[sizeof(Dna)][sizeof(Dna)]; + return 1 - (ld / sizeof(Dna)); + } + + float calc_similarity(std::vector vec, simil_func f) + { + size_t num_pairs = (vec.size() * (vec.size() - 1)) / 2; + + float total_similarity = 0.0; + for (size_t i = 0; i < vec.size(); i++) + { + for (size_t j = i + 1; j < vec.size(); j++) + { + total_similarity += f(&vec[i], &vec[j]); + } + } + float average_similarity = total_similarity / num_pairs; + return average_similarity; + } +}