from typing import List

import pandas as pd


def flatten_list(lst):
    """Flatten an arbitrarily nested list into a single flat list.

    Only ``list`` instances (including subclasses) are descended into; every
    other element (tuples, strings, dicts, ...) is emitted unchanged. Empty
    nested lists contribute nothing to the result.

    Args:
        lst (list): The possibly nested list to flatten.

    Returns:
        list: A new list with the leaf elements in depth-first, left-to-right
        order. A new list is returned even when the input is already flat.

    Raises:
        ValueError: If ``lst`` is not a list.
    """
    if not isinstance(lst, list):
        raise ValueError("You must provide a valid list")

    def _flatten(sublist):
        for item in sublist:
            if isinstance(item, list):
                yield from _flatten(item)
            else:
                yield item

    return list(_flatten(lst))


def flatten_dict(d: dict, parent_key: str = '', sep: str = '_') -> dict:
    """Flatten a nested dictionary into a single-level dictionary.

    Keys of nested dictionaries are joined to their parent key with ``sep``.
    Nested dictionaries that are empty produce no entry in the result, and if
    two paths produce the same flattened key the later one wins.

    Args:
        d (dict): The dictionary to flatten.
        parent_key (str): Prefix for every flattened key. When empty,
            top-level keys are kept exactly as they are (no separator added).
        sep (str): The separator placed between parent and child keys.

    Returns:
        dict: The flattened dictionary.

    Raises:
        ValueError: If ``d`` is not a dictionary.
    """
    if not isinstance(d, dict):
        raise ValueError("You must provide a valid dictionary.")

    def _flatten(mapping, prefix):
        for k, v in mapping.items():
            # An empty prefix means "top level": keep the key untouched.
            new_key = f"{prefix}{sep}{k}" if prefix else k
            if isinstance(v, dict):
                yield from _flatten(v, new_key)
            else:
                yield new_key, v

    return dict(_flatten(d, parent_key))


def filter_dict_by_keys(original_dict, relevant_keys):
    """Return the subset of ``original_dict`` whose keys are in ``relevant_keys``.

    Keys listed in ``relevant_keys`` that are absent from ``original_dict``
    are ignored. The result follows the iteration order of ``relevant_keys``.

    Args:
        original_dict (dict): The dictionary to filter.
        relevant_keys (set): The keys to keep (any iterable of keys works).

    Returns:
        dict: A new dictionary containing only the relevant key-value pairs.
    """
    return {
        key: original_dict[key]
        for key in relevant_keys
        if key in original_dict
    }


def custom_struct_to_df(samples: List[List[pd.DataFrame]]) -> pd.DataFrame:
    """Convert a list of (preprint, article) DataFrame pairs into one DataFrame.

    Args:
        samples (List[List[pd.DataFrame]]): A list of pairs of DataFrames.
            Each pair consists of:
            - A preprint DataFrame (e.g., columns prefixed with 'prpnt').
            - An article DataFrame (e.g., columns prefixed with 'article').

    Returns:
        pd.DataFrame: A single DataFrame where:
            - Each row corresponds to a preprint-article pair.
            - Preprint and article columns keep their original names.
            - The index is reset for the entire DataFrame.
        An empty DataFrame is returned when ``samples`` is empty.
    """
    # ``pd.concat(axis=1)`` aligns on index labels. The two frames of a pair
    # may carry unrelated labels, so reset both first: rows are then paired by
    # position instead of being spread over NaN-padded rows.
    paired_rows = [
        pd.concat(
            [preprint.reset_index(drop=True), article.reset_index(drop=True)],
            axis=1,
        )
        for preprint, article in samples
    ]

    # ``pd.concat`` raises on an empty sequence; an empty input simply maps to
    # an empty frame so the conversion round-trips with df_to_custom_struct.
    if not paired_rows:
        return pd.DataFrame()

    return pd.concat(paired_rows).reset_index(drop=True)


def df_to_custom_struct(df: pd.DataFrame) -> List[List[pd.DataFrame]]:
    """Split a prefixed DataFrame into a list of (preprint, article) pairs.

    Columns whose name starts with ``prpnt`` form the preprint frame and
    columns whose name starts with ``article`` form the article frame. Any
    other column (including columns with non-string labels) is dropped.

    Args:
        df (pd.DataFrame): The input DataFrame with columns prefixed by
            `prpnt` and `article`.

    Returns:
        List[List[pd.DataFrame]]: One ``[preprint, article]`` pair per input
        row. Each element is a single-row DataFrame that keeps the index
        label of the row it came from.
    """
    # Non-string labels cannot carry a prefix; skip them rather than crash on
    # a missing ``startswith`` attribute.
    preprint_columns = [
        col for col in df.columns
        if isinstance(col, str) and col.startswith("prpnt")
    ]
    article_columns = [
        col for col in df.columns
        if isinstance(col, str) and col.startswith("article")
    ]

    # Copy so the returned pairs do not alias the caller's DataFrame.
    preprint_df = df[preprint_columns].copy()
    article_df = df[article_columns].copy()

    # ``iloc[[pos]]`` (list indexer) yields a one-row DataFrame, not a Series.
    return [
        [preprint_df.iloc[[pos]], article_df.iloc[[pos]]]
        for pos in range(len(df))
    ]