import pandas as pd
from pathlib import Path
[docs]
class IoHandler:
    """
    Handles input and output operations.

    Reads the dataset and database tables into pandas DataFrames and writes
    result tables as tab-separated files named after the dataset.
    """

    def __init__(self):
        # Path of the last dataset handed to read_dataset().
        self.dataset_path: Path = None
        # Stem of the dataset file name; prefix of every exported file.
        self.dataset_name: str = None
        # Path of the last database handed to read_database().
        self.database_path: Path = None
        # Results directory set by create_output_directory().
        self.outputs_path: Path = None

    def _output_file(self, suffix: str) -> Path:
        """
        Build the path of an output file inside the results directory.

        :param suffix: Text appended to the dataset name (e.g. ".features.tsv").
        :return: ``<outputs_path>/<dataset_name><suffix>`` as a Path.
        """
        return Path(f"{self.outputs_path}/{self.dataset_name}{suffix}")

    def read_dataset(self, dataset):
        """
        Reads the dataset from the specified file path and loads it into a pandas DataFrame.

        The path and its stem are stored in ``dataset_path`` and ``dataset_name``.

        :param dataset: Path to the tab-separated dataset file (str or Path).
        :return: pd.DataFrame with the content of the file.
        :raises FileNotFoundError: if the file does not exist.
        """
        # Accept plain strings as well as Path objects.
        self.dataset_path = Path(dataset)
        if not self.dataset_path.exists():
            raise FileNotFoundError(f"File {self.dataset_path} does not exist.")
        self.dataset_name = self.dataset_path.stem
        return pd.read_csv(self.dataset_path, sep="\t")

    def read_database(self, database):
        """
        Reads the database from the specified file path and loads it into a pandas DataFrame.

        The path is stored in ``database_path``.

        :param database: Path to the semicolon-separated database file (str or Path).
        :return: pd.DataFrame with the content of the file.
        :raises FileNotFoundError: if the file does not exist.
        """
        # Accept plain strings as well as Path objects.
        self.database_path = Path(database)
        if not self.database_path.exists():
            raise FileNotFoundError(f"File {self.database_path} does not exist.")
        return pd.read_csv(self.database_path, sep=";")

    def create_output_directory(self, outputs_path):
        """
        Create an output directory for saving results.

        The directory is ``<outputs_path>/<dataset_name>_res``; missing parents
        are created and an already existing directory is reused. The resulting
        path is stored in ``outputs_path``.

        :param outputs_path: Path to the parent of the output directory (str or Path).
        """
        # Joining with "/" keeps an empty parent relative instead of
        # silently resolving to the filesystem root.
        res_dir = Path(outputs_path) / f"{self.dataset_name}_res"
        res_dir.mkdir(parents=True, exist_ok=True)
        self.outputs_path = res_dir

    def export_theoretical_database(self, database: pd.DataFrame):
        """
        Export the theoretical database to ``<dataset_name>.theoretical_db.tsv``.

        :param database: DataFrame to export.
        """
        database.to_csv(self._output_file(".theoretical_db.tsv"), sep="\t", index=False)

    def clusters_summary(self, clusters_to_summarize: dict):
        """
        Export a tsv file with a summary of the clusters.

        One row is written per unique ``cluster_id`` (first occurrence wins),
        made of the cluster's ``summary`` plus a ``number_of_samples`` entry.
        The file is ``<dataset_name>.summary.tsv``; nothing is returned.

        :param clusters_to_summarize: dict mapping each sample to a dict of
            clusters. Every cluster must expose ``cluster_id`` and a ``summary``
            mapping containing a "ClusterID" key.
        """
        # Count, once, in how many samples each cluster ID is reported.
        # A set per sample makes a sample count at most once per ID.
        samples_per_cluster = {}
        for sample_clusters in clusters_to_summarize.values():
            reported_ids = {c.summary["ClusterID"] for c in sample_clusters.values()}
            for reported_id in reported_ids:
                samples_per_cluster[reported_id] = samples_per_cluster.get(reported_id, 0) + 1

        rows = []
        seen_ids = set()
        for sample_clusters in clusters_to_summarize.values():
            for cluster in sample_clusters.values():
                if cluster.cluster_id in seen_ids:
                    continue
                seen_ids.add(cluster.cluster_id)
                # NOTE: the entry is added to the object returned by
                # cluster.summary itself, as the previous implementation did.
                summary = cluster.summary
                summary["number_of_samples"] = samples_per_cluster.get(cluster.cluster_id, 0)
                rows.append(summary)

        pd.DataFrame(rows).to_csv(self._output_file(".summary.tsv"), sep="\t", index=False)

    def export_features(self, dataframe_to_export: pd.DataFrame):
        """
        Export all features to ``<dataset_name>.features.tsv``.

        :param dataframe_to_export: DataFrame containing the features to export.
        """
        dataframe_to_export.to_csv(self._output_file(".features.tsv"), sep="\t", index=False)

    def export_clusters(self, dataframe_to_export: pd.DataFrame):
        """
        Export the clusters to ``<dataset_name>.clusters.tsv``.

        :param dataframe_to_export: DataFrame containing the clusters to export.
        """
        dataframe_to_export.to_csv(self._output_file(".clusters.tsv"), sep="\t", index=False)
# def targ_export_features(self, features_to_export:dict, sample_name:str = None):
# """
# Summarize annotated features into a DataFrame and export it to a tsv file.
# :param features_to_export: dict containing features to export
# :param sample_name: Name of the sample to filter the DataFrame by, if provided
# """
# # Create a DataFrame to summarize the experimental features
# feature_data = []
# for sample in features_to_export.values():
# for feature in sample.values():
# feature_data.append({
# "feature_id": feature.feature_id,
# "mz": feature.mz,
# "rt": feature.rt,
# "metabolite": feature.metabolite,
# # "isotopologue": feature.isotopologue,
# "isotopologue": [feature.cluster_isotopologue[met] for met in feature.metabolite],
# "mz_error": feature.mz_error,
# "rt_error": feature.rt_error,
# "sample": feature.sample,
# "intensity": feature.intensity
# })
# # Create a DataFrame to summarize the annotated data
# df = pd.DataFrame(feature_data)
# df.to_csv(f"{self.outputs_path}/{self.dataset_name}.features.tsv", sep="\t", index=False)
# # Export the Dataframe of only one sample if a sample name is provided
# if sample_name:
# df = df[df["sample"] == sample_name] # Filter the DataFrame by sample name
# df.to_csv(f"{self.outputs_path}/{self.dataset_name}.features.tsv", sep="\t", index=False)
# return df
# def targ_export_clusters(self, features:dict, clusters_to_export:dict, sample_name:str = None):
# # Check if the sample name is in the DataFrame
# all_samples = list(features.keys())
# if sample_name is not None:
# if sample_name not in all_samples:
# raise ValueError(f"Sample {sample_name} not found in annotated clusters. Available samples: {', '.join(all_samples)}")
# cluster_data = []
# for sample, clusters in clusters_to_export.items():
# if sample_name is None or sample_name == sample: # Filter the DataFrame by sample name if provided
# for cname, cluster in clusters.items():
# for feature in cluster.features:
# idx = [i for i,j in enumerate(feature.metabolite) if j == cname][0]
# # Get the cluster_id of the features in another cluster
# other_clusters = [c.cluster_id for cluster_name, c in clusters.items() if feature in c.features and c.cluster_id != cluster.cluster_id]
# cluster_data.append({
# "cluster_id": cluster.cluster_id,
# "metabolite": cluster.name,
# "feature_id": feature.feature_id,
# "mz": feature.mz,
# "rt": feature.rt,
# "feature_potential_metabolite": feature.metabolite,
# # "isotopologue": feature.isotopologue[idx],
# "isotopologue": feature.cluster_isotopologue[cluster.name],
# "mz_error": feature.mz_error[idx],
# "rt_error": feature.rt_error[idx],
# "sample": feature.sample,
# "intensity": feature.intensity,
# "status": cluster.status,
# "missing_isotopologue": cluster.missing_isotopologues,
# "duplicated_isotopologue": cluster.duplicated_isotopologues,
# # "in_cluster": feature.in_cluster,
# "in_another_cluster": other_clusters
# })
# # Create a DataFrame to summarize the annotated clusters
# df = pd.DataFrame(cluster_data)
# # Export the DataFrame to a tsv file if a filename is provided
# # if filename:
# df.to_csv(f"{self.outputs_path}/{self.dataset_name}.clusters.tsv", sep="\t", index=False)
# # return df
# def export_clusters_to_tsv(self, filepath: str):
# """
# Export the clusters to a CSV file.
# :param filepath: str
# """
# df = self.clusters_to_dataframe()
# df.to_csv(filepath, sep="\t", index=False)
# if __name__ == "__main__":
# test = IoHandler(
# )
# print(test.read_dataset(r"C:\Users\kouakou\Documents\IsoGroup_test\data\dataset_test_XCMS.txt"))
# print(test.outputs_path)
# print(test.tracer)
# print(test._tracer_element)
# test.export_annotated_features()
# print(test.samples)