#include "values/Similarity.hpp"

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

namespace {

/// Views a Dna object as a read-only sequence of sizeof(Dna) raw bytes.
/// Every metric in this file compares two Dna objects byte by byte.
inline const uint8_t *raw_bytes(const Dna *dna) {
    return reinterpret_cast<const uint8_t *>(dna);
}

/// Turns exact integer sums into a cosine similarity.
///
/// @param dot  sum of element-wise products
/// @param sq1  sum of squares of the first vector
/// @param sq2  sum of squares of the second vector
/// @return dot / (|v1| * |v2|). When a magnitude is zero the quotient is
///         undefined; instead of letting NaN leak into averages this returns
///         1 if both vectors are zero (they are identical) and 0 otherwise.
float cosine_from_sums(int64_t dot, uint64_t sq1, uint64_t sq2) {
    if (sq1 == 0 || sq2 == 0) {
        return (sq1 == sq2) ? 1.0f : 0.0f;
    }
    const double denom = std::sqrt(static_cast<double>(sq1)) *
                         std::sqrt(static_cast<double>(sq2));
    return static_cast<float>(static_cast<double>(dot) / denom);
}

} // namespace

namespace Similarity {

/// Dot product of the raw bytes of both Dna objects, divided by the largest
/// possible dot product (every byte 255 in both inputs).
///
/// @return value in [0, 1]
float dot_minmax(Dna *d1, Dna *d2) {
    const uint8_t *a = raw_bytes(d1);
    const uint8_t *b = raw_bytes(d2);

    constexpr uint64_t max_dot = static_cast<uint64_t>(sizeof(Dna)) * 255u * 255u;

    // 64-bit accumulator: a 32-bit one can wrap once sizeof(Dna) gets large.
    uint64_t dot = 0;
    for (size_t i = 0; i < sizeof(Dna); ++i) {
        dot += static_cast<uint32_t>(a[i]) * static_cast<uint32_t>(b[i]);
    }
    return static_cast<float>(static_cast<double>(dot) / static_cast<double>(max_dot));
}

/// Euclidean distance between the raw bytes, turned into a similarity.
///
/// @return 1 - distance / max_distance, where max_distance is the distance
///         between an all-0 and an all-255 byte sequence; value in [0, 1]
float euclidean_distance(Dna *d1, Dna *d2) {
    const uint8_t *a = raw_bytes(d1);
    const uint8_t *b = raw_bytes(d2);

    // Squared differences are summed as integers so the sum stays exact.
    uint64_t sum_sq = 0;
    for (size_t i = 0; i < sizeof(Dna); ++i) {
        const int diff = static_cast<int>(a[i]) - static_cast<int>(b[i]);
        sum_sq += static_cast<uint64_t>(diff * diff);
    }

    const double distance = std::sqrt(static_cast<double>(sum_sq));
    const double max_distance = 255.0 * std::sqrt(static_cast<double>(sizeof(Dna)));
    return static_cast<float>(1.0 - distance / max_distance);
}

/// Cosine similarity over the raw bytes, each byte taken as a value in 0..255.
/// See cosine_similarity_int for the variant that centres the bytes first.
///
/// @return cosine of the angle between both byte vectors; if a vector is all
///         zero the result is 1 when both are all zero and 0 otherwise
float cosine_similarity(Dna *d1, Dna *d2) {
    const uint8_t *a = raw_bytes(d1);
    const uint8_t *b = raw_bytes(d2);

    int64_t dot = 0;
    uint64_t sq1 = 0;
    uint64_t sq2 = 0;
    for (size_t i = 0; i < sizeof(Dna); ++i) {
        const int64_t x = a[i];
        const int64_t y = b[i];
        dot += x * y;
        sq1 += static_cast<uint64_t>(x * x);
        sq2 += static_cast<uint64_t>(y * y);
    }
    return cosine_from_sums(dot, sq1, sq2);
}

/// Cosine similarity over the raw bytes after mapping each byte from
/// 0..255 to -128..127 (value - 128), so 0 -> -128 and 255 -> 127.
///
/// @return cosine of the angle between both centred vectors; if a centred
///         vector is all zero (every byte is 128) the result is 1 when both
///         are all zero and 0 otherwise
float cosine_similarity_int(Dna *d1, Dna *d2) {
    const uint8_t *a = raw_bytes(d1);
    const uint8_t *b = raw_bytes(d2);

    int64_t dot = 0;
    uint64_t sq1 = 0;
    uint64_t sq2 = 0;
    for (size_t i = 0; i < sizeof(Dna); ++i) {
        const int64_t x = static_cast<int64_t>(a[i]) - 128;
        const int64_t y = static_cast<int64_t>(b[i]) - 128;
        dot += x * y;
        sq1 += static_cast<uint64_t>(x * x);
        sq2 += static_cast<uint64_t>(y * y);
    }
    return cosine_from_sums(dot, sq1, sq2);
}

/// Byte-wise Hamming distance over the whole Dna, turned into a similarity.
///
/// @return fraction of byte positions that are equal; value in [0, 1]
float hamming_distance(Dna *d1, Dna *d2) {
    const uint8_t *a = raw_bytes(d1);
    const uint8_t *b = raw_bytes(d2);

    size_t mismatches = 0;
    for (size_t i = 0; i < sizeof(Dna); ++i) {
        if (a[i] != b[i]) {
            ++mismatches;
        }
    }
    return 1.0f - static_cast<float>(mismatches) / static_cast<float>(sizeof(Dna));
}

/// Same as hamming_distance, but skips the first 3 * sizeof(uint128) bytes.
/// NOTE(review): going by the function name those leading bytes hold seeds;
/// confirm against the Dna layout.
///
/// @return fraction of compared byte positions that are equal; 1 when there
///         is nothing left to compare after the skipped prefix
float hamming_distance_without_seeds(Dna *d1, Dna *d2) {
    constexpr size_t start = sizeof(uint128) * 3;
    constexpr size_t end = sizeof(Dna);

    // No bytes after the prefix: nothing can differ. Without this guard the
    // division below would be 0/0 (NaN) when end == start.
    if (end <= start) {
        return 1.0f;
    }

    const uint8_t *a = raw_bytes(d1);
    const uint8_t *b = raw_bytes(d2);

    size_t mismatches = 0;
    for (size_t i = start; i < end; ++i) {
        if (a[i] != b[i]) {
            ++mismatches;
        }
    }
    return 1.0f - static_cast<float>(mismatches) / static_cast<float>(end - start);
}

/// Maps a similarity function pointer to a short display name.
///
/// @param f one of the similarity functions of this namespace
/// @return static string naming the function, or "unknown nameofFunc" if the
///         pointer matches none of them
const char *nameofFunc(simil_func f) {
    if (f == &Similarity::euclidean_distance) {
        return "eucl";
    }
    if (f == &Similarity::dot_minmax) {
        return "dot";
    }
    if (f == &Similarity::cosine_similarity) {
        return "cos";
    }
    if (f == &Similarity::cosine_similarity_int) {
        return "cos_i";
    }
    if (f == &Similarity::hamming_distance) {
        return "hamming_distance";
    }
    if (f == &Similarity::hamming_distance_without_seeds) {
        return "hamming_distance_without_seeds";
    }
    if (f == &Similarity::levenshtein_distance) {
        return "leven";
    }
    return "unknown nameofFunc";
}

/// Average pairwise similarity of all elements of vec, as a percentage.
///
/// Calls f once for every unordered pair (i < j), so the cost is quadratic
/// in vec.size().
///
/// @param vec elements to compare
/// @param f   similarity function applied to each pair
/// @return mean of f over all pairs, multiplied by 100; 0 when vec has fewer
///         than two elements (there are no pairs to average)
float calc_similarity(std::vector<Dna> &vec, simil_func f) {
    const size_t count = vec.size();
    // With 0 elements "count - 1" would wrap around; with 1 element the
    // average would be 0/0. Neither case has a pair to compare.
    if (count < 2) {
        return 0.0f;
    }

    const size_t num_pairs = (count * (count - 1)) / 2;

    // double keeps the running sum accurate over many pairs.
    double total_similarity = 0.0;
    for (size_t i = 0; i < count; ++i) {
        for (size_t j = i + 1; j < count; ++j) {
            total_similarity += f(&vec[i], &vec[j]);
        }
    }

    const double average_similarity = total_similarity / static_cast<double>(num_pairs);
    return static_cast<float>(average_similarity * 100.0);
}

/// Levenshtein (edit) distance between the raw bytes of both Dna objects,
/// turned into a similarity.
///
/// Only two rows of the distance table are kept and they are local to the
/// call, so the function holds no state between calls.
///
/// NOTE(review): the distance is divided by 2 * sizeof(Dna), while the edit
/// distance of two equal-length sequences is at most sizeof(Dna), so the
/// result never drops below 0.5. Kept as is; confirm this scaling is intended.
///
/// @return 1 - distance / (2 * sizeof(Dna))
float levenshtein_distance(Dna *d1, Dna *d2) {
    constexpr size_t len = sizeof(Dna);
    const uint8_t *a = raw_bytes(d1);
    const uint8_t *b = raw_bytes(d2);

    // prev holds row i - 1 of the distance table, curr holds row i.
    std::vector<uint32_t> prev(len + 1);
    std::vector<uint32_t> curr(len + 1);

    // Row 0: turning an empty prefix into b[0..j) takes j insertions.
    for (size_t j = 0; j <= len; ++j) {
        prev[j] = static_cast<uint32_t>(j);
    }

    for (size_t i = 1; i <= len; ++i) {
        // Column 0: turning a[0..i) into an empty prefix takes i deletions.
        curr[0] = static_cast<uint32_t>(i);
        for (size_t j = 1; j <= len; ++j) {
            const uint32_t cost = (a[i - 1] == b[j - 1]) ? 0u : 1u;
            curr[j] = std::min({
                prev[j] + 1u,       // deletion
                curr[j - 1] + 1u,   // insertion
                prev[j - 1] + cost  // substitution
            });
        }
        prev.swap(curr);
    }

    // After the final swap, prev is the last computed row.
    const uint32_t distance = prev[len];
    return 1.0f - static_cast<float>(distance) / static_cast<float>(len + len);
}

} // namespace Similarity