213 lines
4.9 KiB
C++
213 lines
4.9 KiB
C++
#include "values/Similarity.hpp"
|
|
#include <cmath>
|
|
#include <algorithm>
|
|
#include <numeric>
|
|
#include <raylib.h>
|
|
#include <chrono>
|
|
|
|
namespace Similarity
|
|
{
|
|
|
|
// Normalised dot product of two Dna objects, each viewed as sizeof(Dna) raw
// bytes.
//
// The byte at every offset of *d1 is multiplied with the byte at the same
// offset of *d2 and the products are summed. The sum is then divided by the
// largest value it can reach (sizeof(Dna) * 255 * 255), so the result lies in
// [0, 1].
//
// d1, d2: objects to compare; both are dereferenced without a null check.
// Returns sum(a[i] * b[i]) / (sizeof(Dna) * 255 * 255).
float dot_minmax(Dna *d1, Dna *d2)
{
    constexpr uint64_t max_sum = static_cast<uint64_t>(sizeof(Dna)) * 255u * 255u;

    const uint8_t *a = reinterpret_cast<const uint8_t *>(d1);
    const uint8_t *b = reinterpret_cast<const uint8_t *>(d2);

    // 64-bit accumulator: a 32-bit one wraps once sizeof(Dna) exceeds
    // UINT32_MAX / (255 * 255) bytes, while the normaliser above is already
    // computed in 64 bits.
    uint64_t sum = 0;
    for (size_t i = 0; i < sizeof(Dna); ++i)
    {
        sum += static_cast<uint32_t>(a[i]) * static_cast<uint32_t>(b[i]);
    }

    return static_cast<float>(sum / static_cast<double>(max_sum));
}
|
|
|
|
// Similarity derived from the Euclidean distance between the raw bytes of two
// Dna objects.
//
// The distance is divided by its largest possible value (every byte differing
// by 255) and subtracted from 1, so identical byte images give 1 and maximally
// different ones give 0.
//
// d1, d2: objects to compare; both are dereferenced without a null check.
float euclidean_distance(Dna *d1, Dna *d2)
{
    const uint8_t *lhs = reinterpret_cast<const uint8_t *>(d1);
    const uint8_t *rhs = reinterpret_cast<const uint8_t *>(d2);
    const uint8_t *const lhs_end = lhs + sizeof(Dna);

    float squared_sum = 0.0f;
    for (; lhs != lhs_end; ++lhs, ++rhs)
    {
        const float delta = static_cast<float>(*lhs) - static_cast<float>(*rhs);
        squared_sum += delta * delta;
    }

    // sqrt(sizeof(Dna) * 255^2) == 255 * sqrt(sizeof(Dna))
    const float worst_case = 255.0f * std::sqrt(static_cast<float>(sizeof(Dna)));
    const float measured = std::sqrt(squared_sum);
    return 1 - (measured / worst_case);
}
|
|
|
|
// todo: use int8_t instead of uint8_t and map data
// 0 -> -128
// 255 -> 127
// int8_t = uint8_t - 128
|
|
// Cosine similarity of two Dna objects, each viewed as a vector of
// sizeof(Dna) unsigned bytes.
//
// Returns dot(a, b) / (|a| * |b|). All components are non-negative, so the
// result lies in [0, 1].
//
// Zero-magnitude operands (every byte 0) would make the quotient 0 / 0 = NaN.
// They are handled explicitly instead: 1 when both operands are all-zero
// (their byte images are identical), 0 when only one of them is.
//
// d1, d2: objects to compare; both are dereferenced without a null check.
float cosine_similarity(Dna *d1, Dna *d2)
{
    const uint8_t *a = reinterpret_cast<const uint8_t *>(d1);
    const uint8_t *b = reinterpret_cast<const uint8_t *>(d2);

    // Integer accumulators keep the sums exact; a float accumulator starts
    // dropping low bits once a sum passes 2^24.
    uint64_t dot = 0;
    uint64_t norm_a_sq = 0;
    uint64_t norm_b_sq = 0;
    for (size_t i = 0; i < sizeof(Dna); ++i)
    {
        const uint32_t x = a[i];
        const uint32_t y = b[i];
        dot += x * y;
        norm_a_sq += x * x;
        norm_b_sq += y * y;
    }

    if (norm_a_sq == 0 || norm_b_sq == 0)
    {
        return (norm_a_sq == norm_b_sq) ? 1.0f : 0.0f;
    }

    const double denominator = std::sqrt(static_cast<double>(norm_a_sq)) *
                               std::sqrt(static_cast<double>(norm_b_sq));
    return static_cast<float>(static_cast<double>(dot) / denominator);
}
|
|
|
|
// Cosine similarity of two Dna objects after re-centring every byte from
// [0, 255] to [-128, 127] (value - 128).
//
// Returns dot(a, b) / (|a| * |b|) over the re-centred bytes; because the
// components are signed the result lies in [-1, 1].
//
// A re-centred vector has zero magnitude when every byte equals 128, which
// would make the quotient 0 / 0 = NaN. That case is handled explicitly:
// 1 when both operands are zero vectors (their byte images are identical),
// 0 when only one of them is.
//
// d1, d2: objects to compare; both are dereferenced without a null check.
float cosine_similarity_int(Dna *d1, Dna *d2)
{
    const auto recentre = [](uint8_t value) -> int32_t
    { return static_cast<int32_t>(value) - 128; };

    const uint8_t *a = reinterpret_cast<const uint8_t *>(d1);
    const uint8_t *b = reinterpret_cast<const uint8_t *>(d2);

    // Integer accumulators keep the sums exact; a float accumulator starts
    // dropping low bits once a sum passes 2^24.
    int64_t dot = 0;
    int64_t norm_a_sq = 0;
    int64_t norm_b_sq = 0;
    for (size_t i = 0; i < sizeof(Dna); ++i)
    {
        const int32_t x = recentre(a[i]);
        const int32_t y = recentre(b[i]);
        dot += x * y;
        norm_a_sq += x * x;
        norm_b_sq += y * y;
    }

    if (norm_a_sq == 0 || norm_b_sq == 0)
    {
        return (norm_a_sq == norm_b_sq) ? 1.0f : 0.0f;
    }

    const double denominator = std::sqrt(static_cast<double>(norm_a_sq)) *
                               std::sqrt(static_cast<double>(norm_b_sq));
    return static_cast<float>(static_cast<double>(dot) / denominator);
}
|
|
|
|
// Hamming-style similarity over the raw bytes of two Dna objects.
//
// Counts the byte offsets at which the two objects differ and returns
// 1 - (differing bytes / sizeof(Dna)): 1 when all bytes match, 0 when none do.
//
// d1, d2: objects to compare; both are dereferenced without a null check.
float hamming_distance(Dna *d1, Dna *d2)
{
    const uint8_t *lhs = reinterpret_cast<const uint8_t *>(d1);
    const uint8_t *rhs = reinterpret_cast<const uint8_t *>(d2);

    float mismatches = 0;
    for (size_t pos = 0; pos != sizeof(Dna); ++pos)
    {
        mismatches += (lhs[pos] != rhs[pos]) ? 1.0f : 0.0f;
    }

    const float mismatch_ratio = mismatches / sizeof(Dna);
    return 1 - mismatch_ratio;
}
|
|
|
|
// Hamming-style similarity that ignores the first 3 * sizeof(uint128) bytes
// of each Dna object.
// NOTE(review): going by the function name that prefix holds the seed
// fields - confirm against the Dna layout.
//
// Returns 1 - (differing bytes / compared bytes), where the compared range is
// [3 * sizeof(uint128), sizeof(Dna)).
//
// d1, d2: objects to compare; both are dereferenced without a null check.
float hamming_distance_without_seeds(Dna *d1, Dna *d2)
{
    constexpr size_t first_compared = sizeof(uint128) * 3;
    constexpr size_t past_last = sizeof(Dna);

    const uint8_t *lhs = reinterpret_cast<const uint8_t *>(d1);
    const uint8_t *rhs = reinterpret_cast<const uint8_t *>(d2);

    float mismatches = 0;
    for (size_t pos = first_compared; pos < past_last; ++pos)
    {
        mismatches += (lhs[pos] != rhs[pos]) ? 1.0f : 0.0f;
    }

    const float mismatch_ratio = mismatches / (past_last - first_compared);
    return 1 - mismatch_ratio;
}
|
|
|
|
// Maps a similarity function pointer to a short display label.
//
// f: the function to identify; it is compared by address against the metrics
//    of this namespace.
// Returns the label of the matching metric, or "unknown nameofFunc" when f is
// none of them. The returned pointer refers to a string literal.
const char *nameofFunc(simil_func f)
{
    struct NamedMetric
    {
        simil_func metric;
        const char *label;
    };

    // Checked top to bottom; the first address match wins.
    static const NamedMetric known_metrics[] = {
        {&Similarity::euclidean_distance, "eucl"},
        {&Similarity::dot_minmax, "dot"},
        {&Similarity::cosine_similarity, "cos"},
        {&Similarity::cosine_similarity_int, "cos_i"},
        {&Similarity::hamming_distance, "hamming_distance"},
        {&Similarity::hamming_distance_without_seeds, "hamming_distance_without_seeds"},
        {&Similarity::levenshtein_distance, "leven"},
    };

    for (const NamedMetric &entry : known_metrics)
    {
        if (f == entry.metric)
        {
            return entry.label;
        }
    }
    return "unknown nameofFunc";
}
|
|
|
|
float calc_similarity(std::vector<Dna> &vec, simil_func f)
|
|
{
|
|
auto start = std::chrono::high_resolution_clock::now();
|
|
size_t num_pairs = (vec.size() * (vec.size() - 1)) / 2;
|
|
|
|
float total_similarity = 0.0;
|
|
for (size_t i = 0; i < vec.size(); i++)
|
|
{
|
|
for (size_t j = i + 1; j < vec.size(); j++)
|
|
{
|
|
total_similarity += f(&vec[i], &vec[j]);
|
|
}
|
|
}
|
|
float average_similarity = total_similarity / num_pairs;
|
|
|
|
auto stop = std::chrono::high_resolution_clock::now();
|
|
|
|
const auto int_ms = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
|
|
|
|
return average_similarity * 100.0f;
|
|
}
|
|
|
|
// Similarity derived from the Levenshtein (edit) distance between the raw
// bytes of two Dna objects, using unit cost for insertion, deletion and
// substitution.
//
// Returns 1 - distance / (2 * sizeof(Dna)).
// NOTE(review): two sequences of equal length are never more than
// sizeof(Dna) edits apart, so with this normalisation the result stays in
// [0.5, 1] - confirm that this scale is intended.
//
// Only the previous and the current row of the distance table are kept. Both
// are locals, so the function holds no shared state between calls.
//
// d1, d2: objects to compare; both are dereferenced without a null check.
float levenshtein_distance(Dna *d1, Dna *d2)
{
    constexpr size_t len = sizeof(Dna);
    const uint8_t *a = reinterpret_cast<const uint8_t *>(d1);
    const uint8_t *b = reinterpret_cast<const uint8_t *>(d2);

    std::vector<uint32_t> prev_row(len + 1);
    std::vector<uint32_t> curr_row(len + 1);

    // Row 0: turning an empty prefix of a into the first j bytes of b takes
    // j insertions.
    for (size_t j = 0; j <= len; ++j)
    {
        prev_row[j] = static_cast<uint32_t>(j);
    }

    for (size_t i = 1; i <= len; ++i)
    {
        // Column 0: turning the first i bytes of a into an empty prefix takes
        // i deletions.
        curr_row[0] = static_cast<uint32_t>(i);
        for (size_t j = 1; j <= len; ++j)
        {
            const uint32_t cost = (a[i - 1] == b[j - 1]) ? 0u : 1u;
            curr_row[j] = std::min({
                prev_row[j] + 1,        // deletion
                curr_row[j - 1] + 1,    // insertion
                prev_row[j - 1] + cost  // substitution
            });
        }
        // The row just filled becomes the "previous" row of the next pass.
        prev_row.swap(curr_row);
    }

    // After the final swap prev_row holds the last computed row.
    return 1 - (prev_row[len] / float(len + len));
}
|
|
|
|
}
|