Source code for nucml.processing

import logging
import numpy as np
import pandas as pd
from joblib import load
from scipy.optimize import curve_fit
from sklearn import preprocessing

from nucml.general_utilities import func  # pylint: disable=import-error

pd.options.mode.chained_assignment = None  # default='warn'

def impute_values(df):
    """Impute missing feature values element by element using linear interpolation.

    The passed dataframe must contain both the number of protons and the mass number
    as "Z" and "A" respectively. Within each element (fixed Z), numeric columns are
    first linearly interpolated along increasing A; values that interpolation cannot
    reach (e.g., at the edges of an isotopic chain) are extrapolated by fitting
    ``func`` to each column.

    Args:
        df (pd.DataFrame): DataFrame to impute values of. All missing values will be filled.

    Returns:
        pd.DataFrame: New imputed DataFrame.
    """
    for i in range(0, 119):
        df[df["Z"] == i] = df[df["Z"] == i].sort_values(by="A").interpolate()
        if len(df[df["Z"] == i]) > 1:
            fit_df_original = df[df["Z"] == i].sort_values(by="A").reset_index(drop=True).copy()
            fit_df = fit_df_original.copy()
            col_params = {}
            guess = (0.5, 0.5)
            # Curve fit each numeric column. curve_fit needs at least as many points
            # as free parameters (two here), hence the > 1 check.
            for col in fit_df.select_dtypes(np.number).columns:
                if len(fit_df[col].dropna()) > 1:
                    # Get x & y from the non-missing entries
                    x = fit_df[col].dropna().index.astype(float).values
                    y = fit_df[col].dropna().values
                    # Curve fit the column and store the optimized parameters
                    popt, _ = curve_fit(func, x, y, p0=guess)
                    col_params[col] = popt
            # Extrapolate each fitted column
            for col in col_params.keys():
                # Get the index values for NaNs in the column
                nan_idx = fit_df_original[fit_df_original[col].isnull()].index
                # Extrapolate those points with the fitted function
                fit_df_original.loc[nan_idx, col] = func(nan_idx.values.astype(float), *col_params[col])
            df[df["Z"] == i] = fit_df_original.values
    return df
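
# A minimal usage sketch for impute_values (the element and feature values below are
# hypothetical). The only structural requirements from the docstring are "Z" and "A"
# columns plus numeric feature columns containing the NaNs to fill; here the interior
# NaN is linearly interpolated and the trailing NaN is extrapolated via the fitted curve:
#
#     >>> toy = pd.DataFrame({
#     ...     "Z": [26, 26, 26, 26],
#     ...     "A": [54, 56, 57, 58],
#     ...     "feature": [1.0, np.nan, 3.0, np.nan],
#     ... })
#     >>> imputed = impute_values(toy.copy())
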
def normalize_features(df, to_scale, scaling_type="standard", scaler_dir=None):
    """Applies a transformer or normalizer to a set of specific features in the provided dataframe.

    Args:
        df (pd.DataFrame): DataFrame to normalize/transform.
        to_scale (list): List of columns to apply the normalization to.
        scaling_type (str): Scaler or transformer to use. Options include "poweryeo",
            "standard", "minmax", "maxabs", "robust", and "quantilenormal". See the
            scikit-learn documentation for more information on each of these.
            Defaults to "standard".
        scaler_dir (str): Path-like string to a previously saved scaler. If provided,
            this overrides any other parameter by loading the scaler from the given
            path instead of fitting a new one. Defaults to None.

    Returns:
        object: Fitted scikit-learn scaler object.
    """
    if scaler_dir is not None:
        logging.info("Using previously saved scaler.")
        scaler_object = load(scaler_dir)
    else:
        logging.info("Fitting new scaler.")
        if scaling_type == "poweryeo":
            scaler_object = preprocessing.PowerTransformer().fit(df[to_scale])
        elif scaling_type == "standard":
            scaler_object = preprocessing.StandardScaler().fit(df[to_scale])
        elif scaling_type == "minmax":
            scaler_object = preprocessing.MinMaxScaler().fit(df[to_scale])
        elif scaling_type == "maxabs":
            scaler_object = preprocessing.MaxAbsScaler().fit(df[to_scale])
        elif scaling_type == "robust":
            scaler_object = preprocessing.RobustScaler().fit(df[to_scale])
        elif scaling_type == "quantilenormal":
            scaler_object = preprocessing.QuantileTransformer(output_distribution="normal").fit(df[to_scale])
        else:
            raise ValueError(f"Unknown scaling_type: {scaling_type}")
    return scaler_object
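
# A minimal usage sketch for normalize_features (the column names and scaler path are
# hypothetical). The function returns a fitted scikit-learn scaler; transforming the
# data is left to the caller via the scaler's standard transform method:
#
#     >>> scaler = normalize_features(df, ["Energy", "Data"], scaling_type="minmax")
#     >>> df[["Energy", "Data"]] = scaler.transform(df[["Energy", "Data"]])
#
# To reuse a scaler previously saved with joblib.dump, pass its path instead; this
# skips fitting and loads the stored object:
#
#     >>> scaler = normalize_features(df, ["Energy", "Data"], scaler_dir="scalers/minmax.joblib")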