Source code for isogroup.base.io

import pandas as pd
from pathlib import Path


class IoHandler:
    """
    Handles input and output operations.

    The handler keeps track of the dataset/database paths it was asked to read
    and of the results directory it created, and writes every export as a
    tab-separated file named ``<dataset_name><suffix>`` inside that directory.
    """

    def __init__(self):
        # Path of the dataset given to read_dataset(); None until it is called.
        self.dataset_path: Path = None
        # Stem of dataset_path; used as prefix of every exported file name.
        self.dataset_name: str = None
        # Path of the database given to read_database(); None until it is called.
        self.database_path: Path = None
        # Results directory set by create_output_directory(); None until then.
        self.outputs_path: Path = None

    @staticmethod
    def _as_path(location):
        """
        Return ``location`` as a ``pathlib.Path``.

        Existing ``Path`` objects are returned unchanged; anything else
        (typically a ``str``) is wrapped so that ``.exists()`` and ``.stem``
        are available to the readers.
        """
        return location if isinstance(location, Path) else Path(location)

    def _output_file(self, suffix: str) -> Path:
        """
        Build the path of an export file inside the results directory.

        :param suffix: End of the file name, appended directly to the dataset
            name (e.g. ``".features.tsv"``).
        :return: ``<outputs_path>/<dataset_name><suffix>`` as a ``Path``.
        """
        return Path(f"{self.outputs_path}/{self.dataset_name}{suffix}")

    def read_dataset(self, dataset) -> pd.DataFrame:
        """
        Reads the dataset from the specified file path and loads it into a
        pandas DataFrame.

        Also records the path in ``dataset_path`` and, when the file exists,
        its stem in ``dataset_name``.

        :param dataset: Path to the tab-separated dataset file (``str`` or
            ``pathlib.Path``).
        :return: DataFrame with the content of the file.
        :raises FileNotFoundError: If the file does not exist.
        """
        # Coerce first: a plain string has neither .exists() nor .stem.
        self.dataset_path = self._as_path(dataset)
        if not self.dataset_path.exists():
            raise FileNotFoundError(f"File {self.dataset_path} does not exist.")
        self.dataset_name = self.dataset_path.stem
        return pd.read_csv(self.dataset_path, sep="\t")

    def read_database(self, database) -> pd.DataFrame:
        """
        Reads the database from the specified file path and loads it into a
        pandas DataFrame.

        Also records the path in ``database_path``.

        :param database: Path to the semicolon-separated database file
            (``str`` or ``pathlib.Path``).
        :return: DataFrame with the content of the file.
        :raises FileNotFoundError: If the file does not exist.
        """
        # Coerce first: a plain string has no .exists().
        self.database_path = self._as_path(database)
        if not self.database_path.exists():
            raise FileNotFoundError(f"File {self.database_path} does not exist.")
        return pd.read_csv(self.database_path, sep=";")

    def create_output_directory(self, outputs_path) -> None:
        """
        Create an output directory for saving results.

        The directory is ``<outputs_path>/<dataset_name>_res``; missing parents
        are created and an already existing directory is reused. The resulting
        path is stored in ``self.outputs_path``.

        :param outputs_path: Path to the parent output directory.
        """
        # NOTE(review): dataset_name is still None if read_dataset() has not
        # been called, which yields a "None_res" directory - confirm callers
        # always read the dataset first.
        res_dir = Path(f"{outputs_path}/{self.dataset_name}_res")
        res_dir.mkdir(parents=True, exist_ok=True)
        self.outputs_path = res_dir

    def export_theoretical_database(self, database: pd.DataFrame) -> None:
        """
        Export the theoretical database to
        ``<dataset_name>.theoretical_db.tsv`` in the results directory.

        :param database: DataFrame to write, tab-separated and without the
            index.
        """
        database.to_csv(self._output_file(".theoretical_db.tsv"), sep="\t", index=False)

    def clusters_summary(self, clusters_to_summarize: dict) -> None:
        """
        Export a tsv file with a summary of the clusters.

        One row is written per distinct ``cluster_id`` (first occurrence wins),
        made of that cluster's ``summary`` plus a ``number_of_samples`` column
        counting the samples in which the cluster id appears. The file is
        ``<dataset_name>.summary.tsv`` in the results directory.

        :param clusters_to_summarize: dict mapping each sample to a dict of
            clusters; every cluster exposes ``cluster_id`` and a ``summary``
            mapping containing a ``"ClusterID"`` entry.
        :return: None; the summary is only written to disk.
        """
        # Collect the cluster ids of each sample once, instead of rescanning
        # every sample for every cluster.
        ids_per_sample = [
            {member.summary["ClusterID"] for member in sample_clusters.values()}
            for sample_clusters in clusters_to_summarize.values()
        ]

        rows = []
        seen_ids = set()
        for sample_clusters in clusters_to_summarize.values():
            for cluster in sample_clusters.values():
                if cluster.cluster_id in seen_ids:
                    continue
                seen_ids.add(cluster.cluster_id)
                # Copy so the extra column never leaks into the cluster's own
                # summary mapping.
                row = dict(cluster.summary)
                row["number_of_samples"] = sum(
                    cluster.cluster_id in sample_ids for sample_ids in ids_per_sample
                )
                rows.append(row)

        pd.DataFrame(rows).to_csv(self._output_file(".summary.tsv"), sep="\t", index=False)

    def export_features(self, dataframe_to_export: pd.DataFrame) -> None:
        """
        Export all features to ``<dataset_name>.features.tsv`` in the results
        directory.

        :param dataframe_to_export: DataFrame to write, tab-separated and
            without the index.
        """
        dataframe_to_export.to_csv(self._output_file(".features.tsv"), sep="\t", index=False)

    def export_clusters(self, dataframe_to_export: pd.DataFrame) -> None:
        """
        Export the clusters (Untargeted case) to
        ``<dataset_name>.clusters.tsv`` in the results directory.

        :param dataframe_to_export: DataFrame to write, tab-separated and
            without the index.
        """
        dataframe_to_export.to_csv(self._output_file(".clusters.tsv"), sep="\t", index=False)
# return pd.DataFrame.from_records(records) # def targ_export_features(self, features_to_export:dict, sample_name:str = None): # """ # Summarize annotated features into a DataFrame and export it to a tsv file. # :param features_to_export: dict containing features to export # :param sample_name: Name of the sample to filter the DataFrame by, if provided # """ # # Create a DataFrame to summarize the experimental features # feature_data = [] # for sample in features_to_export.values(): # for feature in sample.values(): # feature_data.append({ # "feature_id": feature.feature_id, # "mz": feature.mz, # "rt": feature.rt, # "metabolite": feature.metabolite, # # "isotopologue": feature.isotopologue, # "isotopologue": [feature.cluster_isotopologue[met] for met in feature.metabolite], # "mz_error": feature.mz_error, # "rt_error": feature.rt_error, # "sample": feature.sample, # "intensity": feature.intensity # }) # # Create a DataFrame to summarize the annotated data # df = pd.DataFrame(feature_data) # df.to_csv(f"{self.outputs_path}/{self.dataset_name}.features.tsv", sep="\t", index=False) # # Export the Dataframe of only one sample if a sample name is provided # if sample_name: # df = df[df["sample"] == sample_name] # Filter the DataFrame by sample name # df.to_csv(f"{self.outputs_path}/{self.dataset_name}.features.tsv", sep="\t", index=False) # return df # def targ_export_clusters(self, features:dict, clusters_to_export:dict, sample_name:str = None): # # Check if the sample name is in the DataFrame # all_samples = list(features.keys()) # if sample_name is not None: # if sample_name not in all_samples: # raise ValueError(f"Sample {sample_name} not found in annotated clusters. 
Available samples: {', '.join(all_samples)}") # cluster_data = [] # for sample, clusters in clusters_to_export.items(): # if sample_name is None or sample_name == sample: # Filter the DataFrame by sample name if provided # for cname, cluster in clusters.items(): # for feature in cluster.features: # idx = [i for i,j in enumerate(feature.metabolite) if j == cname][0] # # Get the cluster_id of the features in another cluster # other_clusters = [c.cluster_id for cluster_name, c in clusters.items() if feature in c.features and c.cluster_id != cluster.cluster_id] # cluster_data.append({ # "cluster_id": cluster.cluster_id, # "metabolite": cluster.name, # "feature_id": feature.feature_id, # "mz": feature.mz, # "rt": feature.rt, # "feature_potential_metabolite": feature.metabolite, # # "isotopologue": feature.isotopologue[idx], # "isotopologue": feature.cluster_isotopologue[cluster.name], # "mz_error": feature.mz_error[idx], # "rt_error": feature.rt_error[idx], # "sample": feature.sample, # "intensity": feature.intensity, # "status": cluster.status, # "missing_isotopologue": cluster.missing_isotopologues, # "duplicated_isotopologue": cluster.duplicated_isotopologues, # # "in_cluster": feature.in_cluster, # "in_another_cluster": other_clusters # }) # # Create a DataFrame to summarize the annotated clusters # df = pd.DataFrame(cluster_data) # # Export the DataFrame to a tsv file if a filename is provided # # if filename: # df.to_csv(f"{self.outputs_path}/{self.dataset_name}.clusters.tsv", sep="\t", index=False) # # return df # def export_clusters_to_tsv(self, filepath: str): # """ # Export the clusters to a CSV file. 
# :param filepath: str # """ # df = self.clusters_to_dataframe() # df.to_csv(filepath, sep="\t", index=False) # if __name__ == "__main__": # test = IoHandler( # ) # print(test.read_dataset(r"C:\Users\kouakou\Documents\IsoGroup_test\data\dataset_test_XCMS.txt")) # print(test.outputs_path) # print(test.tracer) # print(test._tracer_element) # test.export_annotated_features() # print(test.samples)