Source code for nucml.processing

import logging
import numpy as np
import pandas as pd
from joblib import load
from scipy.optimize import curve_fit
from sklearn import preprocessing

from nucml.general_utilities import func  # pylint: disable=import-error

pd.options.mode.chained_assignment = None  # default='warn'

def impute_values(df):
    """Impute missing feature values element by element using linear interpolation.

    The passed dataframe must contain both the number of protons and the mass number
    as "Z" and "A" respectively. Within each element (fixed Z), numeric columns are
    first linearly interpolated along increasing A; values that interpolation cannot
    reach (e.g., at the edges of an isotopic chain) are extrapolated by fitting
    ``func`` to each column.

    Args:
        df (pd.DataFrame): DataFrame to impute values of. All missing values will be filled.

    Returns:
        pd.DataFrame: New imputed DataFrame.
    """
    for i in range(0, 119):
        df[df["Z"] == i] = df[df["Z"] == i].sort_values(by="A").interpolate()
        if len(df[df["Z"] == i]) > 1:
            fit_df_original = df[df["Z"] == i].sort_values(by="A").reset_index(drop=True).copy()
            fit_df = fit_df_original.copy()
            col_params = {}
            guess = (0.5, 0.5)
            # Curve fit each numeric column. curve_fit needs at least as many points
            # as free parameters (two here), hence the > 1 check.
            for col in fit_df.select_dtypes(np.number).columns:
                if len(fit_df[col].dropna()) > 1:
                    # Get x & y from the non-missing entries
                    x = fit_df[col].dropna().index.astype(float).values
                    y = fit_df[col].dropna().values
                    # Curve fit the column and store the optimized parameters
                    popt, _ = curve_fit(func, x, y, p0=guess)
                    col_params[col] = popt
            # Extrapolate each fitted column
            for col in col_params.keys():
                # Get the index values for NaNs in the column
                nan_idx = fit_df_original[fit_df_original[col].isnull()].index
                # Extrapolate those points with the fitted function
                fit_df_original.loc[nan_idx, col] = func(nan_idx.values.astype(float), *col_params[col])
            df[df["Z"] == i] = fit_df_original.values
    return df
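
# A minimal usage sketch for impute_values (the element and feature values below are
# hypothetical). The only structural requirements from the docstring are "Z" and "A"
# columns plus numeric feature columns containing the NaNs to fill; here the interior
# NaN is linearly interpolated and the trailing NaN is extrapolated via the fitted curve:
#
#     >>> toy = pd.DataFrame({
#     ...     "Z": [26, 26, 26, 26],
#     ...     "A": [54, 56, 57, 58],
#     ...     "feature": [1.0, np.nan, 3.0, np.nan],
#     ... })
#     >>> imputed = impute_values(toy.copy())
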
def normalize_features(df, to_scale, scaling_type="standard", scaler_dir=None):
    """Applies a transformer or normalizer to a set of specific features in the provided dataframe.

    Args:
        df (pd.DataFrame): DataFrame to normalize/transform.
        to_scale (list): List of columns to apply the normalization to.
        scaling_type (str): Scaler or transformer to use. Options include "poweryeo",
            "standard", "minmax", "maxabs", "robust", and "quantilenormal". See the
            scikit-learn documentation for more information on each of these.
            Defaults to "standard".
        scaler_dir (str): Path-like string to a previously saved scaler. If provided,
            this overrides any other parameter by loading the scaler from the given
            path instead of fitting a new one. Defaults to None.

    Returns:
        object: Fitted scikit-learn scaler object.
    """
    if scaler_dir is not None:
        logging.info("Using previously saved scaler.")
        scaler_object = load(scaler_dir)
    else:
        logging.info("Fitting new scaler.")
        if scaling_type == "poweryeo":
            scaler_object = preprocessing.PowerTransformer().fit(df[to_scale])
        elif scaling_type == "standard":
            scaler_object = preprocessing.StandardScaler().fit(df[to_scale])
        elif scaling_type == "minmax":
            scaler_object = preprocessing.MinMaxScaler().fit(df[to_scale])
        elif scaling_type == "maxabs":
            scaler_object = preprocessing.MaxAbsScaler().fit(df[to_scale])
        elif scaling_type == "robust":
            scaler_object = preprocessing.RobustScaler().fit(df[to_scale])
        elif scaling_type == "quantilenormal":
            scaler_object = preprocessing.QuantileTransformer(output_distribution="normal").fit(df[to_scale])
        else:
            raise ValueError(f"Unknown scaling_type: {scaling_type}")
    return scaler_object
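
# A minimal usage sketch for normalize_features (the column names and scaler path are
# hypothetical). The function returns a fitted scikit-learn scaler; transforming the
# data is left to the caller via the scaler's standard transform method:
#
#     >>> scaler = normalize_features(df, ["Energy", "Data"], scaling_type="minmax")
#     >>> df[["Energy", "Data"]] = scaler.transform(df[["Energy", "Data"]])
#
# To reuse a scaler previously saved with joblib.dump, pass its path instead; this
# skips fitting and loads the stored object:
#
#     >>> scaler = normalize_features(df, ["Energy", "Data"], scaler_dir="scalers/minmax.joblib")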