From c3236b6189bc005c501d7eab8aa75221dab6be8b Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Wed, 15 Jan 2025 17:04:35 -0500 Subject: [PATCH 1/5] Added MMD --- openpmcvl/statistical_analysis/MMD.py | 104 ++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 openpmcvl/statistical_analysis/MMD.py diff --git a/openpmcvl/statistical_analysis/MMD.py b/openpmcvl/statistical_analysis/MMD.py new file mode 100644 index 0000000..b728d31 --- /dev/null +++ b/openpmcvl/statistical_analysis/MMD.py @@ -0,0 +1,104 @@ +import argparse +import torch +import os + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +""" + From https://www.onurtunali.com/ml/2019/03/08/maximum-mean-discrepancy-in-machine-learning.html#formal-definition +""" +def MMD(x, y, kernel): + """Emprical maximum mean discrepancy. The lower the result + the more evidence that distributions are the same. + + Args: + x: first sample, distribution P + y: second sample, distribution Q + kernel: kernel type such as "multiscale" or "rbf" + """ + xx, yy, zz = torch.mm(x, x.t()), torch.mm(y, y.t()), torch.mm(x, y.t()) + rx = (xx.diag().unsqueeze(0).expand_as(xx)) + ry = (yy.diag().unsqueeze(0).expand_as(yy)) + + dxx = rx.t() + rx - 2. * xx # Used for A in (1) + dyy = ry.t() + ry - 2. * yy # Used for B in (1) + dxy = rx.t() + ry - 2. * zz # Used for C in (1) + + XX, YY, XY = (torch.zeros(xx.shape).to(device), + torch.zeros(xx.shape).to(device), + torch.zeros(xx.shape).to(device)) + + if kernel == "multiscale": + + bandwidth_range = [0.2, 0.5, 0.9, 1.3] + for a in bandwidth_range: + XX += a**2 * (a**2 + dxx)**-1 + YY += a**2 * (a**2 + dyy)**-1 + XY += a**2 * (a**2 + dxy)**-1 + + if kernel == "rbf": + + bandwidth_range = [10, 15, 20, 50] + for a in bandwidth_range: + XX += torch.exp(-0.5*dxx/a) + YY += torch.exp(-0.5*dyy/a) + XY += torch.exp(-0.5*dxy/a) + + return torch.mean(XX + YY - 2. * XY) + + + +def load_tensors_to_matrix(directory_path): + """ + Loads all .pt files in a directory and combines them into a matrix. + + Args: + directory_path (str): Path to the directory containing .pt files. + + Returns: + torch.Tensor: A matrix where each row corresponds to a tensor from a .pt file. + """ + tensors = [] + + try: + for file_name in sorted(os.listdir(directory_path)): + if file_name.endswith(".pt"): + file_path = os.path.join(directory_path, file_name) + tensor = torch.load(file_path) + + # Ensure tensor is 1D + if tensor.ndim == 1: + tensors.append(tensor) + else: + raise ValueError(f"Tensor in {file_path} is not 1D: {tensor.shape}") + + # Combine tensors into a matrix + if tensors: + return torch.stack(tensors) + else: + raise RuntimeError(f"No .pt files found in {directory_path}") + + except Exception as e: + raise RuntimeError(f"Failed to load tensors from {directory_path}: {e}") + + +# Define your functions here, e.g., load_tensors_to_matrix and MMD + +def main(): + parser = argparse.ArgumentParser(description="Compute MMD between two tensor directories.") + parser.add_argument("path1", type=str, help="Path to the first directory containing .pt files.") + parser.add_argument("path2", type=str, help="Path to the second directory containing .pt files.") + parser.add_argument("--kernel", type=str, default="rbf", help="Kernel to use for MMD computation (default: rbf).") + args = parser.parse_args() + + biomedclip_representations = load_tensors_to_matrix(args.path1) + pmcoa2_intext_representations = load_tensors_to_matrix(args.path2) + + biomedclip_representations = biomedclip_representations.to(device) + pmcoa2_intext_representations = pmcoa2_intext_representations.to(device) + + result = MMD(biomedclip_representations, pmcoa2_intext_representations, kernel=args.kernel) + print(f"MMD value (kernel='{args.kernel}') between tensors in '{args.path1}' and '{args.path2}': {result:.6f}") + +if __name__ == "__main__": + main() From a07dbedecab8e99e64726b76e1fe995ccccf1eb2 Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Thu, 16 Jan 2025 12:37:45 -0500 Subject: [PATCH 2/5] Updated MMD --- openpmcvl/statistical_analysis/MMD.py | 180 +++++++++++++++++--------- 1 file changed, 121 insertions(+), 59 deletions(-) diff --git a/openpmcvl/statistical_analysis/MMD.py b/openpmcvl/statistical_analysis/MMD.py index b728d31..6e2abfd 100644 --- a/openpmcvl/statistical_analysis/MMD.py +++ b/openpmcvl/statistical_analysis/MMD.py @@ -1,12 +1,35 @@ import argparse + import torch -import os + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") """ From https://www.onurtunali.com/ml/2019/03/08/maximum-mean-discrepancy-in-machine-learning.html#formal-definition """ + + +def load_tensors_to_matrix(file_path): + """ + Loads a single .pt file containing a matrix. + + Args: + file_path (str): Path to the .pt file containing the matrix. + + Returns + ------- + torch.Tensor: The loaded matrix. + """ + try: + matrix = torch.load(file_path, weights_only=True) + if matrix.ndim == 2: + return matrix + raise ValueError(f"Loaded tensor is not a 2D matrix: {matrix.shape}") + except Exception as e: + raise RuntimeError(f"Failed to load tensor matrix from {file_path}: {e}") + + def MMD(x, y, kernel): """Emprical maximum mean discrepancy. The lower the result the more evidence that distributions are the same. @@ -17,88 +40,127 @@ def MMD(x, y, kernel): kernel: kernel type such as "multiscale" or "rbf" """ xx, yy, zz = torch.mm(x, x.t()), torch.mm(y, y.t()), torch.mm(x, y.t()) - rx = (xx.diag().unsqueeze(0).expand_as(xx)) - ry = (yy.diag().unsqueeze(0).expand_as(yy)) + rx = xx.diag().unsqueeze(0).expand_as(xx) + ry = yy.diag().unsqueeze(0).expand_as(yy) - dxx = rx.t() + rx - 2. * xx # Used for A in (1) - dyy = ry.t() + ry - 2. * yy # Used for B in (1) - dxy = rx.t() + ry - 2. * zz # Used for C in (1) + dxx = rx.t() + rx - 2.0 * xx # Used for A in (1) + dyy = ry.t() + ry - 2.0 * yy # Used for B in (1) + dxy = rx.t() + ry - 2.0 * zz # Used for C in (1) - XX, YY, XY = (torch.zeros(xx.shape).to(device), - torch.zeros(xx.shape).to(device), - torch.zeros(xx.shape).to(device)) + XX, YY, XY = ( + torch.zeros(xx.shape).to(device), + torch.zeros(xx.shape).to(device), + torch.zeros(xx.shape).to(device), + ) if kernel == "multiscale": - bandwidth_range = [0.2, 0.5, 0.9, 1.3] for a in bandwidth_range: - XX += a**2 * (a**2 + dxx)**-1 - YY += a**2 * (a**2 + dyy)**-1 - XY += a**2 * (a**2 + dxy)**-1 + XX += a**2 * (a**2 + dxx) ** -1 + YY += a**2 * (a**2 + dyy) ** -1 + XY += a**2 * (a**2 + dxy) ** -1 if kernel == "rbf": - bandwidth_range = [10, 15, 20, 50] for a in bandwidth_range: - XX += torch.exp(-0.5*dxx/a) - YY += torch.exp(-0.5*dyy/a) - XY += torch.exp(-0.5*dxy/a) - - return torch.mean(XX + YY - 2. * XY) + XX += torch.exp(-0.5 * dxx / a) + YY += torch.exp(-0.5 * dyy / a) + XY += torch.exp(-0.5 * dxy / a) + return torch.mean(XX + YY - 2.0 * XY) -def load_tensors_to_matrix(directory_path): +def combine_and_shuffle(A, B): """ - Loads all .pt files in a directory and combines them into a matrix. - + Combine two matrices A and B into a single dataset, shuffle, and split. + Args: - directory_path (str): Path to the directory containing .pt files. - - Returns: - torch.Tensor: A matrix where each row corresponds to a tensor from a .pt file. + A (torch.Tensor): First dataset of size (m, D). + B (torch.Tensor): Second dataset of size (n, D). + + Returns + ------- + A_prime (torch.Tensor): Shuffled subset of size (m, D) from the combined dataset. + B_prime (torch.Tensor): Shuffled subset of size (n, D) from the combined dataset. """ - tensors = [] - - try: - for file_name in sorted(os.listdir(directory_path)): - if file_name.endswith(".pt"): - file_path = os.path.join(directory_path, file_name) - tensor = torch.load(file_path) - - # Ensure tensor is 1D - if tensor.ndim == 1: - tensors.append(tensor) - else: - raise ValueError(f"Tensor in {file_path} is not 1D: {tensor.shape}") - - # Combine tensors into a matrix - if tensors: - return torch.stack(tensors) - else: - raise RuntimeError(f"No .pt files found in {directory_path}") - - except Exception as e: - raise RuntimeError(f"Failed to load tensors from {directory_path}: {e}") + # Combine A and B into C + C = torch.cat((A, B), dim=0) + # Shuffle C randomly + indices = torch.randperm(C.size(0)) + C_shuffled = C[indices] + + # Split C into A' and B' + m, n = A.size(0), B.size(0) + A_prime = C_shuffled[:m] + B_prime = C_shuffled[m : m + n] + + return A_prime, B_prime + + +def compute_p_value(null_distribution, observed_mmd2): + """ + Computes the p-value as the proportion of null distribution values + greater than or equal to the observed MMD^2 statistic. + + Args: + null_distribution (list or array): List of MMD^2 values from the null distribution. + observed_mmd2 (float): Observed MMD^2 statistic. + + Returns + ------- + float: The computed p-value. + """ + count_greater_equal = sum( + 1 for value in null_distribution if value >= observed_mmd2 + ) + p_value = count_greater_equal / len(null_distribution) + return p_value -# Define your functions here, e.g., load_tensors_to_matrix and MMD def main(): - parser = argparse.ArgumentParser(description="Compute MMD between two tensor directories.") - parser.add_argument("path1", type=str, help="Path to the first directory containing .pt files.") - parser.add_argument("path2", type=str, help="Path to the second directory containing .pt files.") - parser.add_argument("--kernel", type=str, default="rbf", help="Kernel to use for MMD computation (default: rbf).") + parser = argparse.ArgumentParser( + description="Compute MMD between two tensor directories." + ) + parser.add_argument( + "path1", type=str, help="Path to the first directory containing .pt files." + ) + parser.add_argument( + "path2", type=str, help="Path to the second directory containing .pt files." + ) + parser.add_argument( + "--kernel", + type=str, + default="rbf", + help="Kernel to use for MMD computation (default: rbf).", + ) + parser.add_argument( + "--n", type=int, default=10, help="Number of permuting (default: 10)." + ) args = parser.parse_args() - biomedclip_representations = load_tensors_to_matrix(args.path1) - pmcoa2_intext_representations = load_tensors_to_matrix(args.path2) + first_representations = load_tensors_to_matrix(args.path1).to(device) + second_representations = load_tensors_to_matrix(args.path2).to(device) + + MMD_obs = MMD(first_representations, second_representations, kernel=args.kernel) + print(f"Observation MMD value: {MMD_obs:.6f}") + MMD_perms = [] + + for i in range(args.n): + first_representations_prime, second_representations_prime = combine_and_shuffle( + first_representations, second_representations + ) + result = MMD( + first_representations_prime, + second_representations_prime, + kernel=args.kernel, + ) + MMD_perms.append(result) + print(f"{i}. Permuted MMD value: {result:.6f}") - biomedclip_representations = biomedclip_representations.to(device) - pmcoa2_intext_representations = pmcoa2_intext_representations.to(device) + p_value = compute_p_value(MMD_perms, MMD_obs) + print(f"P-value: {p_value}") - result = MMD(biomedclip_representations, pmcoa2_intext_representations, kernel=args.kernel) - print(f"MMD value (kernel='{args.kernel}') between tensors in '{args.path1}' and '{args.path2}': {result:.6f}") if __name__ == "__main__": main() From 6cb7e79f271d0aed0e020b7604291d37a70f3abf Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Thu, 16 Jan 2025 17:13:43 -0500 Subject: [PATCH 3/5] Added Sampling --- openpmcvl/statistical_analysis/MMD.py | 225 +++++++++++++++++++++++--- 1 file changed, 205 insertions(+), 20 deletions(-) diff --git a/openpmcvl/statistical_analysis/MMD.py b/openpmcvl/statistical_analysis/MMD.py index 6e2abfd..5bb2cae 100644 --- a/openpmcvl/statistical_analysis/MMD.py +++ b/openpmcvl/statistical_analysis/MMD.py @@ -1,5 +1,6 @@ import argparse +import numpy as np import torch @@ -30,7 +31,7 @@ def load_tensors_to_matrix(file_path): raise RuntimeError(f"Failed to load tensor matrix from {file_path}: {e}") -def MMD(x, y, kernel): +def MMD(x, y, kernel, bandwidth): """Emprical maximum mean discrepancy. The lower the result the more evidence that distributions are the same. @@ -54,18 +55,14 @@ def MMD(x, y, kernel): ) if kernel == "multiscale": - bandwidth_range = [0.2, 0.5, 0.9, 1.3] - for a in bandwidth_range: - XX += a**2 * (a**2 + dxx) ** -1 - YY += a**2 * (a**2 + dyy) ** -1 - XY += a**2 * (a**2 + dxy) ** -1 - - if kernel == "rbf": - bandwidth_range = [10, 15, 20, 50] - for a in bandwidth_range: - XX += torch.exp(-0.5 * dxx / a) - YY += torch.exp(-0.5 * dyy / a) - XY += torch.exp(-0.5 * dxy / a) + XX += bandwidth**2 * (bandwidth**2 + dxx) ** -1 + YY += bandwidth**2 * (bandwidth**2 + dyy) ** -1 + XY += bandwidth**2 * (bandwidth**2 + dxy) ** -1 + + if kernel == "rbf": # Radial Basis Function (RBF) kernel + XX += torch.exp(-0.5 * dxx / bandwidth) + YY += torch.exp(-0.5 * dyy / bandwidth) + XY += torch.exp(-0.5 * dxy / bandwidth) return torch.mean(XX + YY - 2.0 * XY) @@ -118,15 +115,131 @@ def compute_p_value(null_distribution, observed_mmd2): return p_value +def create_subsamples_from_one_matrix(matrix, subsample_size, percentage): + """ + Creates two subsamples from a matrix with a specified percentage of similar data points + and converts them to PyTorch tensors. + + Args: + matrix (np.ndarray): Input matrix where each row is a data point. + subsample_size (int): Number of data points in each subsample. + percentage (float): Percentage (0-100) of similar data points between the two subsamples. + + Returns + ------- + tuple: Two subsamples as PyTorch tensors. + """ + if not (0 <= percentage <= 100): + raise ValueError("Percentage must be between 0 and 100.") + if subsample_size > len(matrix): + raise ValueError( + "Subsample size cannot be larger than the number of data points in the matrix." + ) + + np.random.shuffle(matrix) + + num_shared_points = int((percentage / 100) * subsample_size) + + num_unique_points = subsample_size - num_shared_points + + shared_indices = np.random.choice(len(matrix), num_shared_points, replace=False) + shared_points = matrix[shared_indices] + + remaining_indices = list(set(range(len(matrix))) - set(shared_indices)) + unique_indices_1 = np.random.choice( + remaining_indices, num_unique_points, replace=False + ) + unique_points_1 = matrix[unique_indices_1] + + remaining_indices = list(set(remaining_indices) - set(unique_indices_1)) + unique_indices_2 = np.random.choice( + remaining_indices, num_unique_points, replace=False + ) + unique_points_2 = matrix[unique_indices_2] + + subsample_1 = np.vstack((shared_points, unique_points_1)) + subsample_2 = np.vstack((shared_points, unique_points_2)) + + tensor_1 = torch.tensor(subsample_1, dtype=torch.float32) + tensor_2 = torch.tensor(subsample_2, dtype=torch.float32) + + return tensor_1.to(device), tensor_2.to(device) + + +def create_subsamples_from_two_matrices(matrix1, matrix2, subsample_size, percentage): + """ + Creates two subsamples from two given matrices with a specified percentage of similar data points + and converts them to PyTorch tensors. + + Args: + matrix1 (np.ndarray): First input matrix where each row is a data point. + matrix2 (np.ndarray): Second input matrix where each row is a data point. + subsample_size (int): Number of data points in each subsample. + percentage (float): Percentage (0-100) of similar data points between the two subsamples. + + Returns + ------- + tuple: Two subsamples as PyTorch tensors. + """ + if not (0 <= percentage <= 100): + raise ValueError("Percentage must be between 0 and 100.") + if subsample_size > len(matrix1) or subsample_size > len(matrix2): + raise ValueError( + "Subsample size cannot be larger than the number of data points in either matrix." + ) + np.random.shuffle(matrix1) + np.random.shuffle(matrix2) + + num_shared_points = int((percentage / 100) * subsample_size) + + num_unique_points = subsample_size - num_shared_points + + shared_indices_matrix1 = np.random.choice( + len(matrix1), num_shared_points, replace=False + ) + shared_points_matrix1 = matrix1[shared_indices_matrix1] + + shared_indices_matrix2 = np.random.choice( + len(matrix2), num_shared_points, replace=False + ) + shared_points_matrix2 = matrix2[shared_indices_matrix2] + + remaining_indices_matrix1 = list( + set(range(len(matrix1))) - set(shared_indices_matrix1) + ) + unique_indices_matrix1 = np.random.choice( + remaining_indices_matrix1, num_unique_points, replace=False + ) + unique_points_matrix1 = matrix1[unique_indices_matrix1] + + remaining_indices_matrix2 = list( + set(range(len(matrix2))) - set(shared_indices_matrix2) + ) + unique_indices_matrix2 = np.random.choice( + remaining_indices_matrix2, num_unique_points, replace=False + ) + unique_points_matrix2 = matrix2[unique_indices_matrix2] + + subsample1 = np.vstack((shared_points_matrix1, unique_points_matrix1)) + subsample2 = np.vstack((shared_points_matrix2, unique_points_matrix2)) + + tensor1 = torch.tensor(subsample1, dtype=torch.float32) + tensor2 = torch.tensor(subsample2, dtype=torch.float32) + + return tensor1.to(device), tensor2.to(device) + + def main(): parser = argparse.ArgumentParser( description="Compute MMD between two tensor directories." ) parser.add_argument( - "path1", type=str, help="Path to the first directory containing .pt files." + "path1", + type=str, + help="Path to the first directory containing .pt files.", ) parser.add_argument( - "path2", type=str, help="Path to the second directory containing .pt files." + "--path2", type=str, help="Path to the second directory containing .pt files." ) parser.add_argument( "--kernel", @@ -135,14 +248,60 @@ def main(): help="Kernel to use for MMD computation (default: rbf).", ) parser.add_argument( - "--n", type=int, default=10, help="Number of permuting (default: 10)." + "--n", type=int, default=100, help="Number of permuting (default: 10)." + ) + parser.add_argument( + "--bandwidth", + type=int, + default=10, + help="Number of bandwidth for kernel (default: 10).", + ) + parser.add_argument( + "--sampling_type", + type=str, + choices=["one_matrix", "two_matrices"], + help="Type of sampling: 'one_matrix' for a single matrix or 'two_matrices' for two matrices.", + ) + parser.add_argument( + "--subsample_size", + type=int, + default=10, + help="Number of data points in each subsample (default: 10).", + ) + parser.add_argument( + "--percentage", + type=float, + default=50.0, + help="Percentage of similar data points between subsamples (default: 50).", ) args = parser.parse_args() - first_representations = load_tensors_to_matrix(args.path1).to(device) - second_representations = load_tensors_to_matrix(args.path2).to(device) + first_representations = load_tensors_to_matrix(args.path1) - MMD_obs = MMD(first_representations, second_representations, kernel=args.kernel) + if args.sampling_type == "one_matrix": + first_representations, second_representations = create_subsamples_from_one_matrix( + first_representations, args.subsample_size, args.percentage + ) + elif args.sampling_type == "two_matrices": + second_representations = load_tensors_to_matrix(args.path2) + first_representations, second_representations = ( + create_subsamples_from_two_matrices( + first_representations, + second_representations, + args.subsample_size, + args.percentage, + ) + ) + elif args.sampling_type is None: + first_representations = first_representations.to(device) + second_representations = load_tensors_to_matrix(args.path2).to(device) + + MMD_obs = MMD( + first_representations, + second_representations, + kernel=args.kernel, + bandwidth=args.bandwidth, + ) print(f"Observation MMD value: {MMD_obs:.6f}") MMD_perms = [] @@ -154,12 +313,38 @@ def main(): first_representations_prime, second_representations_prime, kernel=args.kernel, + bandwidth=args.bandwidth, ) MMD_perms.append(result) print(f"{i}. Permuted MMD value: {result:.6f}") p_value = compute_p_value(MMD_perms, MMD_obs) - print(f"P-value: {p_value}") + print(f"P-value: {p_value} \n") + + MMD_perms = torch.stack(MMD_perms) + overall_min = MMD_perms.min() + overall_max = MMD_perms.max() + + print(f"Observation MMD value: {MMD_obs} \n") + + print(f"Overall Min: {overall_min}") + print(f"Overall Max: {overall_max}") + + # Calculate the 95% percentile range + lower_percentile = torch.quantile(MMD_perms, 0.025) # 2.5% percentile + upper_percentile = torch.quantile(MMD_perms, 0.975) # 97.5% percentile + + # Filter values within the 95% range + middle_values = MMD_perms[ + (MMD_perms >= lower_percentile) & (MMD_perms <= upper_percentile) + ] + + # Find the min and max of the middle 95% + middle_min = middle_values.min() + middle_max = middle_values.max() + + print(f"Middle 95% Min: {middle_min}") + print(f"Middle 95% Max: {middle_max}") if __name__ == "__main__": From 8fb067dc4c03216063c2b8ada342d42d23e03acf Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Thu, 30 Jan 2025 10:53:24 -0500 Subject: [PATCH 4/5] Added PCA --- .gitignore | 10 ++++++++++ openpmcvl/statistical_analysis/MMD.py | 14 ++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/.gitignore b/.gitignore index 5d47251..aecb53e 100644 --- a/.gitignore +++ b/.gitignore @@ -147,3 +147,13 @@ outputs/ *.jpg *.png *.git +*.mov +*.avi +*.pkl +*.mpeg +*.dcr +*.jsonl + + +env/ +myenv/ \ No newline at end of file diff --git a/openpmcvl/statistical_analysis/MMD.py b/openpmcvl/statistical_analysis/MMD.py index 5bb2cae..58ceed2 100644 --- a/openpmcvl/statistical_analysis/MMD.py +++ b/openpmcvl/statistical_analysis/MMD.py @@ -2,6 +2,7 @@ import numpy as np import torch +from sklearn.decomposition import PCA device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -292,6 +293,15 @@ def main(): args.percentage, ) ) + if first_representations.shape[1] != second_representations.shape[1]: + if first_representations.shape[1] > second_representations.shape[1]: + pca = PCA(n_components=second_representations.shape[1]) + first_representations_reduced = pca.fit_transform(first_representations.cpu().numpy()) + first_representations = torch.tensor(first_representations_reduced, dtype=torch.float32, device=first_representations.device) + else: + pca = PCA(n_components=first_representations.shape[1]) + second_representations_reduced = pca.fit_transform(second_representations.cpu().numpy()) + second_representations = torch.tensor(second_representations_reduced, dtype=torch.float32, device=second_representations.device) elif args.sampling_type is None: first_representations = first_representations.to(device) second_representations = load_tensors_to_matrix(args.path2).to(device) @@ -342,6 +352,10 @@ def main(): # Find the min and max of the middle 95% middle_min = middle_values.min() middle_max = middle_values.max() + + + torch.set_printoptions(precision=10) + print(MMD_perms) print(f"Middle 95% Min: {middle_min}") print(f"Middle 95% Max: {middle_max}") From 3ddc8a7edc2898e57c3b0f8977ba12501e32ad3a Mon Sep 17 00:00:00 2001 From: Negiiiin Date: Fri, 31 Jan 2025 20:52:10 -0500 Subject: [PATCH 5/5] Added PCA for Both --- openpmcvl/statistical_analysis/MMD.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/openpmcvl/statistical_analysis/MMD.py b/openpmcvl/statistical_analysis/MMD.py index 58ceed2..e8e9da1 100644 --- a/openpmcvl/statistical_analysis/MMD.py +++ b/openpmcvl/statistical_analysis/MMD.py @@ -298,10 +298,16 @@ def main(): pca = PCA(n_components=second_representations.shape[1]) first_representations_reduced = pca.fit_transform(first_representations.cpu().numpy()) first_representations = torch.tensor(first_representations_reduced, dtype=torch.float32, device=first_representations.device) + pca = PCA(n_components=second_representations.shape[1]) + second_representations_reduced = pca.fit_transform(second_representations.cpu().numpy()) + second_representations = torch.tensor(second_representations_reduced, dtype=torch.float32, device=second_representations.device) else: pca = PCA(n_components=first_representations.shape[1]) second_representations_reduced = pca.fit_transform(second_representations.cpu().numpy()) second_representations = torch.tensor(second_representations_reduced, dtype=torch.float32, device=second_representations.device) + pca = PCA(n_components=first_representations.shape[1]) + first_representations_reduced = pca.fit_transform(first_representations.cpu().numpy()) + first_representations = torch.tensor(first_representations_reduced, dtype=torch.float32, device=first_representations.device) elif args.sampling_type is None: first_representations = first_representations.to(device) second_representations = load_tensors_to_matrix(args.path2).to(device)