import marimo as mo
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from sklearn.model_selection import train_test_split
from IPython.display import Markdown, display

Introduction¶
- This notebook is the assessment for M515 - Ethical Issues of AI.
- This notebook uses the Bank Marketing Dataset from the UCI Machine Learning Repository.
- The dataset contains information about direct marketing campaigns of a Portuguese banking institution.
- The goal is to predict whether a client will subscribe to a term deposit based on various features and understand the issues with respect to bias and fairness in AI models.
GitHub Repository & Dataset:¶
- GitHub Code Link: https://github.com/c2p-cmd/EthicalIssuesOfAI
- Dataset Link: https://archive.ics.uci.edu/dataset/222/bank+marketing
Problem Statement¶
- To analyze the Bank Marketing Dataset for potential ethical issues, including bias and fairness in AI models.
- To identify any disparities in model performance across different demographic groups.
About the data:¶
Summary:¶
The data is related to direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact with the same client was required in order to assess whether the product (bank term deposit) would be subscribed ('yes') or not ('no').
There are four datasets:
- bank-additional-full.csv with all examples (41188) and 20 inputs, ordered by date (from May 2008 to November 2010), very close to the data analyzed in [Moro et al., 2014]
- bank-additional.csv with 10% of the examples (4119), randomly selected from 1), and 20 inputs.
- bank-full.csv with all examples and 17 inputs, ordered by date (older version of this dataset with fewer inputs).
- bank.csv with 10% of the examples and 17 inputs, randomly selected from 3) (older version of this dataset with fewer inputs). The smallest datasets are provided to test more computationally demanding machine learning algorithms (e.g., SVM).
The classification goal is to predict whether the client will subscribe (yes/no) to a term deposit (variable y).
Variable Info:¶
Input variables:
1. age (numeric)
2. job: type of job (categorical: "admin.", "unknown", "unemployed", "management", "housemaid", "entrepreneur", "student", "blue-collar", "self-employed", "retired", "technician", "services")
3. marital: marital status (categorical: "married", "divorced", "single"; note: "divorced" means divorced or widowed)
4. education (categorical: "unknown", "secondary", "primary", "tertiary")
5. default: has credit in default? (binary: "yes", "no")
6. balance: average yearly balance, in euros (numeric)
7. housing: has housing loan? (binary: "yes", "no")
8. loan: has personal loan? (binary: "yes", "no")
9. contact: contact communication type (categorical: "unknown", "telephone", "cellular")
10. day: last contact day of the month (numeric)
11. month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")
12. duration: last contact duration, in seconds (numeric)
13. campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
14. pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)
15. previous: number of contacts performed before this campaign and for this client (numeric)
16. poutcome: outcome of the previous marketing campaign (categorical: "unknown", "other", "failure", "success")
Output variable (desired target):
17. y - has the client subscribed a term deposit? (binary: "yes","no")
Data Loading & Preprocessing¶
df = pd.read_csv(
"https://raw.githubusercontent.com/c2p-cmd/EthicalIssuesOfAI/refs/heads/main/bank_marketing_data.csv"
)
df.head()
| | age | job | marital | education | default | balance | housing | loan | contact | day_of_week | month | duration | campaign | pdays | previous | poutcome | y |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 58 | management | married | tertiary | no | 2143 | yes | no | NaN | 5 | may | 261 | 1 | -1 | 0 | NaN | no |
| 1 | 44 | technician | single | secondary | no | 29 | yes | no | NaN | 5 | may | 151 | 1 | -1 | 0 | NaN | no |
| 2 | 33 | entrepreneur | married | secondary | no | 2 | yes | yes | NaN | 5 | may | 76 | 1 | -1 | 0 | NaN | no |
| 3 | 47 | blue-collar | married | NaN | no | 1506 | yes | no | NaN | 5 | may | 92 | 1 | -1 | 0 | NaN | no |
| 4 | 33 | NaN | single | NaN | no | 1 | no | no | NaN | 5 | may | 198 | 1 | -1 | 0 | NaN | no |
Markdown(f"""### **Observation** The dataset has {len(df)} samples with {len(df.columns)} columns.""")
Observation The dataset has 45211 samples with 17 columns.¶
df.info(show_counts=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   age          45211 non-null  int64
 1   job          44923 non-null  object
 2   marital      45211 non-null  object
 3   education    43354 non-null  object
 4   default      45211 non-null  object
 5   balance      45211 non-null  int64
 6   housing      45211 non-null  object
 7   loan         45211 non-null  object
 8   contact      32191 non-null  object
 9   day_of_week  45211 non-null  int64
 10  month        45211 non-null  object
 11  duration     45211 non-null  int64
 12  campaign     45211 non-null  int64
 13  pdays        45211 non-null  int64
 14  previous     45211 non-null  int64
 15  poutcome     8252 non-null   object
 16  y            45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB
pd.DataFrame(df.isnull().sum(), columns=["Count"])
| | Count |
|---|---|
| age | 0 |
| job | 288 |
| marital | 0 |
| education | 1857 |
| default | 0 |
| balance | 0 |
| housing | 0 |
| loan | 0 |
| contact | 13020 |
| day_of_week | 0 |
| month | 0 |
| duration | 0 |
| campaign | 0 |
| pdays | 0 |
| previous | 0 |
| poutcome | 36959 |
| y | 0 |
Observation: There are missing values in the job, education, contact, and poutcome columns¶
df[df.isnull().sum()[df.isnull().sum() != 0].index.tolist()].head(10)
| | job | education | contact | poutcome |
|---|---|---|---|---|
| 0 | management | tertiary | NaN | NaN |
| 1 | technician | secondary | NaN | NaN |
| 2 | entrepreneur | secondary | NaN | NaN |
| 3 | blue-collar | NaN | NaN | NaN |
| 4 | NaN | NaN | NaN | NaN |
| 5 | management | tertiary | NaN | NaN |
| 6 | management | tertiary | NaN | NaN |
| 7 | entrepreneur | tertiary | NaN | NaN |
| 8 | retired | primary | NaN | NaN |
| 9 | technician | secondary | NaN | NaN |
Imputation Strategy¶
- For the job column, mark missing values as "unknown".
- For the education column, mark missing values as "unknown".
- Drop the contact column, as it has too many missing values.
- For the poutcome column, mark missing values as "not-contacted".
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()  # avoid mutating the caller's frame
    # Missing job/education become an explicit "unknown" category.
    df["job"] = df["job"].fillna("unknown")
    df["education"] = df["education"].fillna("unknown")
    # contact has ~29% missing values, so drop it entirely.
    df = df.drop(columns=["contact"])
    # Missing poutcome means the client was never previously contacted.
    df["poutcome"] = df["poutcome"].fillna("not-contacted")
    return df
cleaned_df = df.pipe(clean_data)
cleaned_df
| | age | job | marital | education | default | balance | housing | loan | day_of_week | month | duration | campaign | pdays | previous | poutcome | y |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 58 | management | married | tertiary | no | 2143 | yes | no | 5 | may | 261 | 1 | -1 | 0 | not-contacted | no |
| 1 | 44 | technician | single | secondary | no | 29 | yes | no | 5 | may | 151 | 1 | -1 | 0 | not-contacted | no |
| 2 | 33 | entrepreneur | married | secondary | no | 2 | yes | yes | 5 | may | 76 | 1 | -1 | 0 | not-contacted | no |
| 3 | 47 | blue-collar | married | unknown | no | 1506 | yes | no | 5 | may | 92 | 1 | -1 | 0 | not-contacted | no |
| 4 | 33 | unknown | single | unknown | no | 1 | no | no | 5 | may | 198 | 1 | -1 | 0 | not-contacted | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 45206 | 51 | technician | married | tertiary | no | 825 | no | no | 17 | nov | 977 | 3 | -1 | 0 | not-contacted | yes |
| 45207 | 71 | retired | divorced | primary | no | 1729 | no | no | 17 | nov | 456 | 2 | -1 | 0 | not-contacted | yes |
| 45208 | 72 | retired | married | secondary | no | 5715 | no | no | 17 | nov | 1127 | 5 | 184 | 3 | success | yes |
| 45209 | 57 | blue-collar | married | secondary | no | 668 | no | no | 17 | nov | 508 | 4 | -1 | 0 | not-contacted | no |
| 45210 | 37 | entrepreneur | married | secondary | no | 2971 | no | no | 17 | nov | 361 | 2 | 188 | 11 | other | no |
45211 rows × 16 columns
pd.DataFrame(cleaned_df.isnull().sum(), columns=["Count"])
| | Count |
|---|---|
| age | 0 |
| job | 0 |
| marital | 0 |
| education | 0 |
| default | 0 |
| balance | 0 |
| housing | 0 |
| loan | 0 |
| day_of_week | 0 |
| month | 0 |
| duration | 0 |
| campaign | 0 |
| pdays | 0 |
| previous | 0 |
| poutcome | 0 |
| y | 0 |
len(cleaned_df), len(cleaned_df.columns)
(45211, 16)
Observation: After cleaning, the dataset has 45211 samples with 16 columns and no missing values.¶
Exploratory Data Analysis (EDA)¶
features = cleaned_df.drop(columns="y").columns.tolist()
pd.DataFrame(features, columns=["Features of dataset"])
| | Features of dataset |
|---|---|
| 0 | age |
| 1 | job |
| 2 | marital |
| 3 | education |
| 4 | default |
| 5 | balance |
| 6 | housing |
| 7 | loan |
| 8 | day_of_week |
| 9 | month |
| 10 | duration |
| 11 | campaign |
| 12 | pdays |
| 13 | previous |
| 14 | poutcome |
Statistical Summary of Numerical & Categorical Features¶
cleaned_df[features].describe(include=[np.number])
| | age | balance | day_of_week | duration | campaign | pdays | previous |
|---|---|---|---|---|---|---|---|
| count | 45211.000000 | 45211.000000 | 45211.000000 | 45211.000000 | 45211.000000 | 45211.000000 | 45211.000000 |
| mean | 40.936210 | 1362.272058 | 15.806419 | 258.163080 | 2.763841 | 40.197828 | 0.580323 |
| std | 10.618762 | 3044.765829 | 8.322476 | 257.527812 | 3.098021 | 100.128746 | 2.303441 |
| min | 18.000000 | -8019.000000 | 1.000000 | 0.000000 | 1.000000 | -1.000000 | 0.000000 |
| 25% | 33.000000 | 72.000000 | 8.000000 | 103.000000 | 1.000000 | -1.000000 | 0.000000 |
| 50% | 39.000000 | 448.000000 | 16.000000 | 180.000000 | 2.000000 | -1.000000 | 0.000000 |
| 75% | 48.000000 | 1428.000000 | 21.000000 | 319.000000 | 3.000000 | -1.000000 | 0.000000 |
| max | 95.000000 | 102127.000000 | 31.000000 | 4918.000000 | 63.000000 | 871.000000 | 275.000000 |
cleaned_df[features].describe(include=["object"])
| | job | marital | education | default | housing | loan | month | poutcome |
|---|---|---|---|---|---|---|---|---|
| count | 45211 | 45211 | 45211 | 45211 | 45211 | 45211 | 45211 | 45211 |
| unique | 12 | 3 | 4 | 2 | 2 | 2 | 12 | 4 |
| top | blue-collar | married | secondary | no | yes | no | may | not-contacted |
| freq | 9732 | 27214 | 23202 | 44396 | 25130 | 37967 | 13766 | 36959 |
plt.figure(figsize=(24, 26))
for f in features:
    if cleaned_df[f].dtype == "object":
        plt.subplot(4, 4, features.index(f) + 1)
        plt.pie(
            cleaned_df[f].value_counts(),
            autopct="%1.1f%%",
            labels=cleaned_df[f].value_counts().index,
            colors=sns.color_palette("pastel"),
        )
        plt.title(f"Distribution of {f}")
        plt.xticks(rotation=45)
        plt.grid()
    else:
        plt.subplot(4, 4, features.index(f) + 1)
        sns.histplot(cleaned_df[f], bins=30, kde=True, color="skyblue")
        plt.title(f"Distribution of {f}")
        plt.grid()
plt.show()
EDA Observations:¶
- age: The distribution is right-skewed, with the majority of clients aged between 30 and 60.
- job: "Blue-collar" (21.5%), "management" (20.9%), and "technician" (16.8%) are the three most common job types. "Student" (2.1%) and "unemployed" (2.9%) are among the least represented.
- marital: Most clients are "married" (60.1%), followed by "single" (28.3%) and "divorced" (11.5%).
- education: "Secondary" (51.3%) and "tertiary" (29.4%) education levels make up the vast majority of the dataset.
- default: An overwhelming majority of clients (98.2%) have no credit in default.
- balance: The distribution is extremely right-skewed, indicating that most clients have a low balance, while a few outliers have very high balances.
- housing: A slight majority of clients (55.6%) have a housing loan.
- loan: The vast majority of clients (84.0%) do not have a personal loan.
- day_of_week: Despite its name, this column holds the day of the month (1-31); the distribution of calls appears relatively uniform, with slightly higher counts mid-month.
- month: Marketing activity is not uniform. It peaks heavily in "May" (~30%), followed by "July" (15.3%), "Aug" (13.8%), and "Jun" (11.8%).
- duration: The call duration is heavily right-skewed, showing that most calls are short, with a long tail of longer-duration calls.
- campaign: This feature is also very right-skewed. Most clients are contacted only a few times (1-3), while a small number of clients are contacted many times.
- pdays: The histogram is dominated by a single value (-1, indicating not previously contacted), with very few clients having been contacted recently.
- previous: This distribution is extremely skewed, with the vast majority of clients having 0 previous contacts.
- poutcome: The outcome of previous campaigns is "not-contacted" for 81.7% of clients, which corresponds to the previous and pdays plots.
Summary:¶
age, marital, job, and education are the "sensitive attributes."¶
Analysis of sensitive targets with target variable¶
plt.figure(figsize=(18, 12))
sensitive_attributes = ["age", "marital", "job", "education"]
for _attr in sensitive_attributes:
    plt.subplot(2, 2, sensitive_attributes.index(_attr) + 1)
    if cleaned_df[_attr].dtype == "object":
        sns.countplot(data=cleaned_df, x=_attr, hue="y", palette="Set2")
        plt.xticks(rotation=45)
    else:
        sns.histplot(
            data=cleaned_df,
            x=_attr,
            hue="y",
            multiple="stack",
            bins=30,
            palette="Set2",
        )
    plt.title(f"{_attr} vs Target Variable")
    if _attr == "age":
        plt.ylabel("Count")
    else:
        plt.ylabel("")
    plt.grid()
plt.show()
Insights from the plots on sensitive attributes.¶
age vs Target Variable¶
- Absolute Counts: Most clients—both those who subscribed ("yes") and those who did not ("no")—fall within the 30–50 age range.
- Subscription Rate (Proportion): The relative share of "yes" responses is highest among younger clients (around 20–30) and older clients (over 60). The middle-aged group (30–50) shows a lower overall subscription rate.
marital vs Target Variable¶
- Absolute Counts: Married clients make up the largest share of both positive ("yes") and negative ("no") outcomes.
- Subscription Rate (Proportion): Single clients show a higher likelihood of subscription compared to married or divorced clients.
job vs Target Variable¶
- Absolute Counts: The majority of clients belong to the "blue-collar," "management," or "technician" categories.
- Subscription Rate (Proportion): Subscription likelihood differs widely across occupations:
- Higher Rates: "Student" and "retired" groups have the highest proportion of "yes" responses.
- Lower Rates: "Blue-collar" and "entrepreneur" groups show the lowest proportion of subscriptions.
- This indicates a notable disparity tied to socio-economic status.
education vs Target Variable¶
- Absolute Counts: Most clients have "secondary" or "tertiary" education.
- Subscription Rate (Proportion): The "tertiary" group shows a higher rate of subscriptions than the "secondary" and "primary" groups, while the "unknown" category also performs relatively well.
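The subscription rates described above can be computed directly with a groupby. A minimal sketch on a toy frame (the notebook itself would pass cleaned_df and any of the sensitive attributes):

```python
import pandas as pd

# Toy stand-in for cleaned_df; the notebook would use the full dataset.
toy = pd.DataFrame({
    "education": ["primary", "primary", "tertiary", "tertiary", "secondary"],
    "y": ["no", "no", "yes", "no", "no"],
})

# Share of "yes" outcomes per education level.
rates = toy.groupby("education")["y"].apply(lambda s: (s == "yes").mean())
print(rates)
```

Applied to the real data, this yields the per-group proportions that the stacked plots above show visually.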
Analysis of non-sensitive targets with target variable¶
plt.figure(figsize=(21, 18))
non_sensitive_attributes = cleaned_df.drop(
    columns=["y"] + sensitive_attributes
).columns.tolist()
for _attr in non_sensitive_attributes:
    plt.subplot(4, 3, non_sensitive_attributes.index(_attr) + 1)
    if cleaned_df[_attr].dtype == "object":
        sns.countplot(data=cleaned_df, x=_attr, hue="y", palette="Set2")
    else:
        sns.histplot(
            data=cleaned_df,
            x=_attr,
            hue="y",
            multiple="stack",
            bins=30,
            palette="Set2",
        )
    plt.title(f"{_attr} vs Target Variable")
    if _attr == "default":
        plt.ylabel("Count")
    else:
        plt.ylabel("")
    plt.xlabel("")
    plt.grid()
plt.show()
Insights for the non-sensitive features.¶
default vs Target Variable¶
- Observation: Nearly all clients do not have credit in default. The small subset of clients with a default shows a slightly lower subscription rate.
balance vs Target Variable¶
- Observation: Most clients are concentrated at lower balance levels, where most outcomes also occur. However, the proportion of subscriptions tends to rise with higher balances, suggesting that clients with greater financial resources are more likely to subscribe.
housing vs Target Variable¶
- Observation: Clients without a housing loan show a noticeably higher subscription rate compared to those who have one.
loan vs Target Variable¶
- Observation: Similar to housing, clients without a personal loan are far more likely to subscribe than those with an existing loan.
day_of_week vs Target Variable¶
- Observation: Subscription rates appear fairly stable across the days of the month, indicating that this feature may have limited predictive power.
month vs Target Variable¶
- Observation: The subscription rate varies strongly by month.
- Higher Rates: March, September, October, and December show a high proportion of subscriptions despite relatively low call volumes.
- Lower Rates: May has the highest call volume but one of the lowest subscription rates.
duration vs Target Variable¶
- Observation: Call duration is the most influential feature. Longer calls correspond to a much higher proportion of "yes" responses, while very short calls are mostly "no."
- Critical Note: This likely represents data leakage. Since duration is only known after the call ends, it cannot be used in a realistic prediction setting.
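The leakage concern can be made concrete by comparing mean call duration across outcomes. A minimal sketch on toy data (the notebook would group cleaned_df by y):

```python
import pandas as pd

# Toy frame mimicking the pattern in the real data: subscribers ("yes")
# tend to have much longer calls, because duration is only known post-call.
toy = pd.DataFrame({
    "duration": [50, 80, 120, 600, 700, 900],
    "y": ["no", "no", "no", "yes", "yes", "yes"],
})

mean_duration = toy.groupby("y")["duration"].mean()
print(mean_duration)
```

A large gap between the two group means is exactly why the modeling section below drops duration from the feature set.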
campaign vs Target Variable¶
- Observation: The highest subscription rate occurs on the first contact, then declines sharply as the number of contacts within the same campaign increases.
pdays vs Target Variable¶
- Observation: Most clients were not contacted previously (represented by the large "-1" category). Among those who were, more recent contacts (lower pdays values) tend to correlate with higher subscription rates.
previous vs Target Variable¶
- Observation: The majority of clients have no previous contact history. For those who do (even one or two prior interactions), the subscription rate is noticeably higher.
poutcome vs Target Variable¶
- Observation: This feature is a strong predictor.
- Clients with a previous campaign outcome of "success" show a very high subscription rate.
- Those with outcomes of "failure" or "other" have lower rates.
- Clients with an "unknown" outcome (the majority) show the lowest subscription rate overall.
Data Preparation¶
# Drop duration (data leakage), plus day_of_week and default (little signal).
X = cleaned_df.drop(columns=["duration", "day_of_week", "default", "y"])
y = cleaned_df["y"]
X_train, X_test, y_train, y_test = train_test_split(
X,
y,
test_size=0.2,
random_state=19,
stratify=y,
)
Markdown(
f"""
### Data Sizes
* X train shape: `{X_train.shape}`
* X test shape: `{X_test.shape}`
* y train shape: `{y_train.shape}`
* y test shape: `{y_test.shape}`
"""
)
Data Sizes¶
- X train shape: (36168, 12)
- X test shape: (9043, 12)
- y train shape: (36168,)
- y test shape: (9043,)
from IPython.display import JSON
numerical_features = X_train.select_dtypes(
include=[np.number]
).columns.tolist()
categorical_features = X_train.select_dtypes(
exclude=[np.number]
).columns.tolist()
JSON({
"Numerical features": numerical_features,
"Categorical features": categorical_features,
})
plt.figure(figsize=(10, 6))
plt.subplot(1, 2, 1)
plt.pie(
y_train.value_counts(),
autopct="%1.1f%%",
labels=y_train.value_counts().index,
colors=sns.color_palette("pastel"),
)
plt.title("Training Distribution of Target")
plt.grid()
plt.subplot(1, 2, 2)
plt.pie(
y_test.value_counts(),
autopct="%1.1f%%",
labels=y_test.value_counts().index,
colors=sns.color_palette("pastel"),
)
plt.title("Test Distribution of Target")
plt.grid()
plt.show()
Observation: Both the training and test sets have roughly 88% "no" and 12% "yes" labels¶
Due to the class imbalance in the target, we compute class weights so the models do not simply favor the majority class¶
from sklearn.utils import compute_class_weight
class_names = np.unique(y)
weights = dict(
zip(
class_names,
compute_class_weight(
class_weight="balanced",
y=y_train,
classes=class_names,
),
)
)
JSON(weights)
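For reference, "balanced" weights follow n_samples / (n_classes * n_c), so the minority class receives the larger weight. A quick sketch on toy labels (the notebook computes the same quantity on y_train):

```python
import numpy as np
from sklearn.utils import compute_class_weight

# Toy labels: 8 "no" vs 2 "yes", imbalanced like the real target.
y_toy = np.array(["no"] * 8 + ["yes"] * 2)
classes = np.unique(y_toy)
w = compute_class_weight(class_weight="balanced", classes=classes, y=y_toy)

# n_samples / (n_classes * n_c): no -> 10/(2*8) = 0.625, yes -> 10/(2*2) = 2.5
print(dict(zip(classes, w)))
```

On the real training set this is what produces the ~0.57 / ~4.27 weights visible in the pipeline reprs below.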
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(
[
("cat", OrdinalEncoder(), categorical_features),
("num", MinMaxScaler(), numerical_features),
]
)
svm = Pipeline(
steps=[
("preprocessor", ct),
(
"classifier",
SVC(
random_state=19,
# gamma="auto",
class_weight=weights,
),
),
]
)
random_forest = Pipeline(
steps=[
("preprocessor", ct),
(
"classifier",
RandomForestClassifier(
n_estimators=300,
random_state=19,
criterion="log_loss",
class_weight=weights,
),
),
]
)
logistic_regression = Pipeline(
steps=[
("preprocessor", ct),
(
"classifier",
LogisticRegression(
class_weight=weights,
random_state=19,
solver="newton-cholesky",
max_iter=10_000,
),
),
]
)
svm
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('cat', OrdinalEncoder(),
['job', 'marital',
'education', 'housing',
'loan', 'month',
'poutcome']),
('num', MinMaxScaler(),
['age', 'balance', 'campaign',
'pdays', 'previous'])])),
('classifier',
SVC(class_weight={'no': np.float64(0.5662397845758838),
'yes': np.float64(4.27416686362562)},
random_state=19))])
random_forest
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('cat', OrdinalEncoder(),
['job', 'marital',
'education', 'housing',
'loan', 'month',
'poutcome']),
('num', MinMaxScaler(),
['age', 'balance', 'campaign',
'pdays', 'previous'])])),
('classifier',
RandomForestClassifier(class_weight={'no': np.float64(0.5662397845758838),
'yes': np.float64(4.27416686362562)},
criterion='log_loss', n_estimators=300,
random_state=19))])
logistic_regression
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('cat', OrdinalEncoder(),
['job', 'marital',
'education', 'housing',
'loan', 'month',
'poutcome']),
('num', MinMaxScaler(),
['age', 'balance', 'campaign',
'pdays', 'previous'])])),
('classifier',
LogisticRegression(class_weight={'no': np.float64(0.5662397845758838),
'yes': np.float64(4.27416686362562)},
max_iter=10000, random_state=19,
solver='newton-cholesky'))])
from tqdm import tqdm
predictions = []
for _model in tqdm([svm, random_forest, logistic_regression], desc="Training Model", unit="model"):
    _model.fit(X_train, y_train)
    predictions.append(_model.predict(X_test))
Markdown("### Training Complete")
Training Model: 100%|███████████████████████████████████████████████| 3/3 [00:28<00:00, 9.53s/model]
Training Complete¶
from sklearn.metrics import classification_report
_reports = []
for _name, _preds in tqdm(
    zip(["SVM", "Random Forest", "Logistic Regression"], predictions),
    desc="Predicting...",
):
    _reports.append(
        pd.DataFrame(
            classification_report(
                y_test,
                _preds,
                output_dict=True,
            )
        )
    )
for _name, _report in zip(["SVM", "Random Forest", "Logistic Regression"], _reports):
    display(Markdown(f"### Classification Report for {_name}"))
    display(_report)
Predicting...: 3it [00:00, 27.29it/s]
Classification Report for SVM¶
| | no | yes | accuracy | macro avg | weighted avg |
|---|---|---|---|---|---|
| precision | 0.930701 | 0.216060 | 0.698441 | 0.573381 | 0.847091 |
| recall | 0.711459 | 0.600189 | 0.698441 | 0.655824 | 0.698441 |
| f1-score | 0.806445 | 0.317738 | 0.698441 | 0.562092 | 0.749268 |
| support | 7985.000000 | 1058.000000 | 0.698441 | 9043.000000 | 9043.000000 |
Classification Report for Random Forest¶
| | no | yes | accuracy | macro avg | weighted avg |
|---|---|---|---|---|---|
| precision | 0.904372 | 0.612299 | 0.892292 | 0.758336 | 0.870200 |
| recall | 0.981841 | 0.216446 | 0.892292 | 0.599144 | 0.892292 |
| f1-score | 0.941516 | 0.319832 | 0.892292 | 0.630674 | 0.868781 |
| support | 7985.000000 | 1058.000000 | 0.892292 | 9043.000000 | 9043.000000 |
Classification Report for Logistic Regression¶
| | no | yes | accuracy | macro avg | weighted avg |
|---|---|---|---|---|---|
| precision | 0.933299 | 0.207688 | 0.674444 | 0.570493 | 0.848405 |
| recall | 0.679900 | 0.633270 | 0.674444 | 0.656585 | 0.674444 |
| f1-score | 0.786698 | 0.312792 | 0.674444 | 0.549745 | 0.731252 |
| support | 7985.000000 | 1058.000000 | 0.674444 | 9043.000000 | 9043.000000 |
pd.DataFrame(sensitive_attributes, columns=["Sensitive Attributes"])
| | Sensitive Attributes |
|---|---|
| 0 | age |
| 1 | marital |
| 2 | job |
| 3 | education |
def disparate_impact(y_pred, name, feature):
    # Compare the share of "yes" predictions across groups of `feature`.
    eval_df = pd.DataFrame(
        {
            feature: X_test[feature],
            "Prediction": y_pred,
        }
    )
    disparity = (
        eval_df.groupby([feature, "Prediction"]).size().unstack(fill_value=0)
    )
    disparity["Total"] = disparity.sum(axis=1)
    disparity["Proportion No"] = (disparity["no"] / disparity["Total"]) * 100
    disparity["Proportion Yes"] = (disparity["yes"] / disparity["Total"]) * 100
    display(Markdown(f"## Disparate Impact on **{feature}** for **{name}**"))
    return disparity
for _name, _preds in zip(
    ["SVM", "Random Forest", "Logistic Regression"],
    predictions,
):
    display(disparate_impact(_preds, _name, feature="marital"))
    display(disparate_impact(_preds, _name, feature="education"))
Disparate Impact on marital for SVM¶
| Prediction | no | yes | Total | Proportion No | Proportion Yes |
|---|---|---|---|---|---|
| marital | |||||
| divorced | 741 | 310 | 1051 | 70.504282 | 29.495718 |
| married | 3858 | 1626 | 5484 | 70.350109 | 29.649891 |
| single | 1505 | 1003 | 2508 | 60.007974 | 39.992026 |
Disparate Impact on education for SVM¶
| Prediction | no | yes | Total | Proportion No | Proportion Yes |
|---|---|---|---|---|---|
| education | |||||
| primary | 1127 | 247 | 1374 | 82.023290 | 17.976710 |
| secondary | 3449 | 1221 | 4670 | 73.854390 | 26.145610 |
| tertiary | 1351 | 1297 | 2648 | 51.019637 | 48.980363 |
| unknown | 177 | 174 | 351 | 50.427350 | 49.572650 |
Disparate Impact on marital for Random Forest¶
| Prediction | no | yes | Total | Proportion No | Proportion Yes |
|---|---|---|---|---|---|
| marital | |||||
| divorced | 1011 | 40 | 1051 | 96.194101 | 3.805899 |
| married | 5290 | 194 | 5484 | 96.462436 | 3.537564 |
| single | 2368 | 140 | 2508 | 94.417863 | 5.582137 |
Disparate Impact on education for Random Forest¶
| Prediction | no | yes | Total | Proportion No | Proportion Yes |
|---|---|---|---|---|---|
| education | |||||
| primary | 1323 | 51 | 1374 | 96.288210 | 3.711790 |
| secondary | 4520 | 150 | 4670 | 96.788009 | 3.211991 |
| tertiary | 2489 | 159 | 2648 | 93.995468 | 6.004532 |
| unknown | 337 | 14 | 351 | 96.011396 | 3.988604 |
Disparate Impact on marital for Logistic Regression¶
| Prediction | no | yes | Total | Proportion No | Proportion Yes |
|---|---|---|---|---|---|
| marital | |||||
| divorced | 768 | 283 | 1051 | 73.073264 | 26.926736 |
| married | 3694 | 1790 | 5484 | 67.359592 | 32.640408 |
| single | 1355 | 1153 | 2508 | 54.027113 | 45.972887 |
Disparate Impact on education for Logistic Regression¶
| Prediction | no | yes | Total | Proportion No | Proportion Yes |
|---|---|---|---|---|---|
| education | |||||
| primary | 1101 | 273 | 1374 | 80.131004 | 19.868996 |
| secondary | 3241 | 1429 | 4670 | 69.400428 | 30.599572 |
| tertiary | 1335 | 1313 | 2648 | 50.415408 | 49.584592 |
| unknown | 140 | 211 | 351 | 39.886040 | 60.113960 |
Observations on Disparate Impact Analysis¶
Disparate Impact on Marital Status¶
- Definition: Disparate impact occurs when a model's predictions disproportionately affect different demographic groups.
- Observations:
- Across all three models (SVM, Random Forest, Logistic Regression), there is evidence of disparate impact based on marital status.
- Single individuals consistently receive a higher proportion of "yes" predictions than married or divorced individuals (SVM: ~40% vs. ~30%; Logistic Regression: ~46% vs. ~27-33%; Random Forest: ~5.6% vs. ~3.5-3.8%).
- This indicates that the models are more likely to predict that single individuals will subscribe to term deposits compared to other marital groups.
- The Random Forest model shows the largest relative disparity between marital groups (single clients are roughly 1.6 times as likely as married clients to receive a "yes"), suggesting it may be amplifying patterns in the training data.
Disparate Impact on Education¶
- Observations:
- There is substantial disparate impact across education levels.
- Individuals with tertiary education consistently receive a higher proportion of "yes" predictions than those with primary education (SVM: ~49% vs. ~18%; Logistic Regression: ~50% vs. ~20%; Random Forest: ~6% vs. ~3.7%).
- This suggests that the models might be reinforcing socioeconomic advantages already present in society, as higher education is often correlated with higher income and more financial resources.
- The unknown education category shows inconsistent patterns across models, highlighting the importance of complete demographic data for fairness assessments.
Ethical Implications¶
- The observed disparate impact could lead to reinforcing existing inequalities in financial opportunity.
- Financial institutions might inadvertently target marketing campaigns toward already privileged groups (single individuals or those with tertiary education).
- This could result in less access to beneficial financial products for married individuals or those with lower educational attainment.
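These disparities can be quantified with the disparate-impact ratio (the "four-fifths rule"): each group's selection rate divided by the most-favored group's rate, with values below 0.8 conventionally flagged as adverse impact. A sketch using the SVM marital proportions from the tables above:

```python
# "Yes"-prediction rates per marital group, taken from the SVM table above.
rates = {"divorced": 0.2950, "married": 0.2965, "single": 0.3999}

favored = max(rates, key=rates.get)  # group with the highest selection rate
di_ratio = {g: r / rates[favored] for g, r in rates.items()}

# Ratios below 0.8 fail the four-fifths rule.
for g in sorted(di_ratio):
    print(f"{g}: {di_ratio[g]:.2f}")
```

By this rule, the SVM's treatment of divorced and married clients (ratios around 0.74) would already be flagged relative to single clients.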
from sklearn.metrics import accuracy_score
def disparity_mistreatment(y_pred, name, feature):
    # Per-group accuracy: large gaps indicate disparate mistreatment.
    eval_df = pd.DataFrame(
        {
            feature: X_test[feature],
            "Prediction": y_pred,
            "Actual": y_test,
        }
    )
    # Select the needed columns before apply to avoid the pandas
    # FutureWarning about operating on the grouping columns.
    accuracy = (
        eval_df.groupby(feature)[["Actual", "Prediction"]]
        .apply(lambda x: accuracy_score(x["Actual"], x["Prediction"]))
        .rename("Accuracy")
        .reset_index()
    )
    accuracy["Accuracy"] = accuracy["Accuracy"] * 100
    display(Markdown(f"## Disparity Mistreatment (Accuracy) on **{feature}** for **{name}**"))
    return accuracy
for _name, _preds in zip(
    ["SVM", "Random Forest", "Logistic Regression"],
    predictions,
):
    display(disparity_mistreatment(_preds, _name, "marital"))
    display(disparity_mistreatment(_preds, _name, "education"))
Disparity Mistreatment (Accuracy) on marital for SVM¶
| | marital | Accuracy |
|---|---|---|
| 0 | divorced | 72.312084 |
| 1 | married | 71.444201 |
| 2 | single | 65.311005 |
Disparity Mistreatment (Accuracy) on education for SVM¶
| education | Accuracy | |
|---|---|---|
| 0 | primary | 82.823872 |
| 1 | secondary | 73.447537 |
| 2 | tertiary | 58.345921 |
| 3 | unknown | 57.834758 |
Disparity Mistreatment (Accuracy) on marital for Random Forest¶
| | marital | Accuracy |
|---|---|---|
| 0 | divorced | 89.819220 |
| 1 | married | 90.663749 |
| 2 | single | 85.845295 |
Disparity Mistreatment (Accuracy) on education for Random Forest¶
| | education | Accuracy |
|---|---|---|
| 0 | primary | 90.829694 |
| 1 | secondary | 90.299786 |
| 2 | tertiary | 86.744713 |
| 3 | unknown | 87.464387 |
Disparity Mistreatment (Accuracy) on marital for Logistic Regression¶
| | marital | Accuracy |
|---|---|---|
| 0 | divorced | 74.500476 |
| 1 | married | 69.256018 |
| 2 | single | 60.526316 |
Disparity Mistreatment (Accuracy) on education for Logistic Regression¶
| | education | Accuracy |
|---|---|---|
| 0 | primary | 80.349345 |
| 1 | secondary | 69.978587 |
| 2 | tertiary | 58.496979 |
| 3 | unknown | 50.712251 |
Observations on Disparate Mistreatment Analysis¶
Disparate Mistreatment on Marital Status¶
- Definition: Disparate mistreatment occurs when a model's accuracy differs across demographic groups.
- Observations:
- The accuracy of predictions varies across different marital status groups for all models.
- SVM Model: Highest accuracy for divorced clients (72.3%) and married clients (71.4%), with single clients noticeably lower (65.3%).
- Random Forest Model: Strongest overall, with married (90.7%) and divorced (89.8%) clients ahead of single clients (85.8%).
- Logistic Regression: Shows the widest spread, from 74.5% for divorced clients down to 60.5% for single clients.
- All models show a roughly 5-14 percentage point accuracy gap between the highest and lowest performing groups, with single clients consistently the worst served.
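The gap quoted above can be computed as a single summary statistic per model and feature. A minimal sketch on illustrative arrays (the `y_true`, `y_pred`, and `groups` values below are made up, not the notebook's variables):

```python
import pandas as pd

def accuracy_gap(y_true, y_pred, groups):
    """Per-group accuracy (%) and the max-min gap in percentage points."""
    df = pd.DataFrame({"group": groups, "y_true": y_true, "y_pred": y_pred})
    # Select the metric columns before .apply so the grouping column
    # stays out of the lambda (avoids the pandas FutureWarning).
    per_group = (
        df.groupby("group")[["y_true", "y_pred"]]
        .apply(lambda g: (g["y_true"] == g["y_pred"]).mean() * 100)
    )
    return per_group, per_group.max() - per_group.min()

per_group, gap = accuracy_gap(
    y_true=[1, 0, 1, 0, 1, 0, 1, 0],
    y_pred=[1, 0, 1, 0, 1, 1, 0, 1],
    groups=["a"] * 4 + ["b"] * 4,
)
print(per_group)  # group a: 100.0, group b: 25.0
print(gap)        # 75.0
```

Reporting the gap alongside the per-group table makes it easy to track whether mitigation steps actually narrow the disparity.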
Disparate Mistreatment on Education¶
- Observations:
- More pronounced accuracy disparities exist across education levels compared to marital status.
- Primary Education: Receives the highest prediction accuracy in every model (80-91%).
- Tertiary Education: Shows markedly lower accuracy (58-87%), a gap of over 20 percentage points from primary education in the SVM and Logistic Regression models.
- Secondary Education: Falls in between, closer to primary education performance.
- Unknown Education: Tracks tertiary education and drops to 50.7% for Logistic Regression, highlighting potential issues with missing data.
Ethical Implications¶
- Counterintuitively, the models are least accurate for the very groups the disparate impact analysis showed being favored in targeting: single clients and those with tertiary or unknown education.
- Whichever direction a gap runs, accuracy disparities of this size mean some groups systematically receive more incorrect decisions than others, potentially perpetuating disadvantage.
- The disparity in accuracy suggests that the features used by the models better represent the behavior of certain demographic groups, possibly because those groups dominate the training data, creating an inherent bias in the predictive capability.
def disparity_treatment(y_pred, name, feature):
    """Per-group error rate for one model's predictions on a sensitive feature."""
    eval_df = pd.DataFrame(
        {
            feature: X_test[feature],
            "Prediction": y_pred,
            "Actual": y_test,
        }
    )
    # Selecting the metric columns before .apply keeps the grouping column out
    # of the lambda and avoids the pandas DataFrameGroupBy.apply FutureWarning.
    error_rate = (
        eval_df.groupby(feature)[["Actual", "Prediction"]]
        .apply(lambda x: (x["Actual"] != x["Prediction"]).mean())
        .rename("Error Rate")
        .reset_index()
    )
    display(Markdown(f"## Disparity Treatment on **{feature}** for **{name}**"))
    return error_rate
for _name, _preds in zip(
    ["SVM", "Random Forest", "Logistic Regression"],
    predictions,
):
    display(disparity_treatment(_preds, _name, "marital"))
    display(disparity_treatment(_preds, _name, "education"))
Disparity Treatment on marital for SVM¶
| | marital | Error Rate |
|---|---|---|
| 0 | divorced | 0.276879 |
| 1 | married | 0.285558 |
| 2 | single | 0.346890 |
Disparity Treatment on education for SVM¶
| | education | Error Rate |
|---|---|---|
| 0 | primary | 0.171761 |
| 1 | secondary | 0.265525 |
| 2 | tertiary | 0.416541 |
| 3 | unknown | 0.421652 |
Disparity Treatment on marital for Random Forest¶
| | marital | Error Rate |
|---|---|---|
| 0 | divorced | 0.101808 |
| 1 | married | 0.093363 |
| 2 | single | 0.141547 |
Disparity Treatment on education for Random Forest¶
| | education | Error Rate |
|---|---|---|
| 0 | primary | 0.091703 |
| 1 | secondary | 0.097002 |
| 2 | tertiary | 0.132553 |
| 3 | unknown | 0.125356 |
Disparity Treatment on marital for Logistic Regression¶
| | marital | Error Rate |
|---|---|---|
| 0 | divorced | 0.254995 |
| 1 | married | 0.307440 |
| 2 | single | 0.394737 |
Disparity Treatment on education for Logistic Regression¶
| | education | Error Rate |
|---|---|---|
| 0 | primary | 0.196507 |
| 1 | secondary | 0.300214 |
| 2 | tertiary | 0.415030 |
| 3 | unknown | 0.492877 |
Observations on Disparate Treatment Analysis¶
Disparate Treatment on Marital Status¶
- Definition: Disparate treatment is measured here as differences in error rates across demographic groups.
- Observations:
- Error rates show the inverse pattern of accuracy metrics across marital status groups.
- Single clients consistently have the highest error rates (14-39%) across all models.
- Married and divorced groups show lower error rates (9-31%).
- Logistic Regression displays the largest disparity between groups (about 14 percentage points); Random Forest the smallest (about 5 points).
- These error rate differences indicate that single clients are the most likely to receive incorrect predictions.
Disparate Treatment on Education¶
- Observations:
- Primary Education: Consistently experiences the lowest error rates (9-20%) across all models.
- Tertiary and Unknown Education: Show the highest error rates (13-49%), creating a substantial gap with primary education.
- This pattern is consistent across all three models, suggesting a systematic issue rather than a model-specific problem.
- The gap in error rates between education levels (4-30 percentage points depending on the model) is more substantial than the marital status disparities.
Ethical Implications¶
- The higher error rates for single clients and for those with tertiary or unknown education could lead to systemic disadvantages for these groups.
- In a banking context, these disparities could translate into:
- Reduced opportunity: Higher false negative rates might cause marketing campaigns to miss potential customers among these groups.
- Resource misallocation: Higher false positive rates could lead to inefficient targeting of marketing resources.
- Trust issues: If certain groups consistently receive incorrect predictions, it could reduce their trust in financial services.
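The false negative and false positive rates mentioned above can be broken out per group in the same groupby style used elsewhere in this notebook. A minimal sketch on made-up arrays (not the notebook's variables):

```python
import pandas as pd

def group_error_rates(y_true, y_pred, groups):
    """Per-group false positive rate (FPR) and false negative rate (FNR)."""
    df = pd.DataFrame({"group": groups, "y_true": y_true, "y_pred": y_pred})

    def rates(g):
        neg = g[g["y_true"] == 0]  # actual non-subscribers
        pos = g[g["y_true"] == 1]  # actual subscribers
        return pd.Series({
            "FPR": (neg["y_pred"] == 1).mean(),  # wasted marketing contacts
            "FNR": (pos["y_pred"] == 0).mean(),  # missed potential customers
        })

    return df.groupby("group")[["y_true", "y_pred"]].apply(rates)

# Illustrative: group "b" misses every actual subscriber (FNR = 1.0).
rates = group_error_rates(
    y_true=[1, 1, 0, 0, 1, 1, 0, 0],
    y_pred=[1, 1, 0, 0, 0, 0, 0, 1],
    groups=["a"] * 4 + ["b"] * 4,
)
print(rates)
```

Splitting the error rate into FPR and FNR shows which of the two harms (reduced opportunity vs. resource misallocation) dominates for each group.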
Comparison Across Fairness Metrics¶
- The three fairness metrics (impact, mistreatment, and treatment) collectively indicate that the models treat demographic groups unevenly, though not always in the same direction:
- The disparate impact analysis favors single clients and those with tertiary education, who receive higher rates of positive predictions.
- The mistreatment and treatment metrics show those same groups receiving the least accurate predictions and the highest error rates.
- This combination of heavier targeting and lower accuracy suggests the models generalize best to the majority groups, pointing to biases rooted in the dataset and modeling approach.
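The three metrics can be laid side by side for any one model, which makes directional mismatches like the one above easy to spot. A sketch on illustrative arrays (positive-prediction rate stands in for disparate impact; the data below is made up):

```python
import pandas as pd

def fairness_summary(y_true, y_pred, groups):
    """Per-group positive-prediction rate (impact), accuracy (mistreatment),
    and error rate (treatment) in one table."""
    df = pd.DataFrame({"group": groups, "y_true": y_true, "y_pred": y_pred})

    def metrics(g):
        acc = (g["y_true"] == g["y_pred"]).mean()
        return pd.Series({
            "Positive rate": (g["y_pred"] == 1).mean(),
            "Accuracy": acc,
            "Error rate": 1 - acc,
        })

    return df.groupby("group")[["y_true", "y_pred"]].apply(metrics)

summary = fairness_summary(
    y_true=[1, 0, 1, 0, 1, 0],
    y_pred=[1, 0, 0, 0, 1, 1],
    groups=["a", "a", "a", "b", "b", "b"],
)
print(summary)
```

A group can have a higher positive rate and identical accuracy, or vice versa, so no single metric is sufficient on its own.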
Summary and Mitigation Strategies¶
Overall Fairness Assessment¶
- The analysis reveals consistent disparity patterns across multiple fairness metrics and machine learning models:
- Demographic Disparities: Positive prediction rates favor single clients and those with tertiary education, yet accuracy is lowest and error rates are highest for exactly those groups, while married clients and those with primary education are predicted most reliably.
- Model Consistency: All three models (SVM, Random Forest, and Logistic Regression) show similar patterns, suggesting that the issue lies in the data rather than in specific modeling choices.
- Multiple Fairness Dimensions: Disparities appear across impact (prediction rates), mistreatment (accuracy), and treatment (error rates) metrics, indicating a fundamental fairness issue.
Potential Causes of Bias¶
- Historical Data Patterns: The training data likely reflects historical banking practices that favored certain demographic groups.
- Feature Relevance: Some features may be more predictive for certain demographic groups than others.
- Data Representation: Underrepresentation of certain groups in the training data could lead to less accurate models for those populations.
- Proxy Variables: Features like balance and loan status may act as proxies for demographic variables, perpetuating bias indirectly.
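The proxy-variable concern above can be checked directly: correlate each candidate feature with the sensitive attribute and treat a strong correlation as a flag. A sketch on synthetic data (the feature names `balance` and `duration` are borrowed from the dataset, but the values here are simulated, with `balance` deliberately constructed to leak the attribute):

```python
import numpy as np

rng = np.random.default_rng(0)
n = 1000
# Simulated binary sensitive attribute (e.g. tertiary education, yes/no).
sensitive = rng.integers(0, 2, n)
# "balance" is constructed to leak the attribute; "duration" is pure noise.
balance = sensitive * 2.0 + rng.normal(size=n)
duration = rng.normal(size=n)

# Point-biserial correlation with the sensitive attribute flags proxies.
r_balance = np.corrcoef(balance, sensitive)[0, 1]
r_duration = np.corrcoef(duration, sensitive)[0, 1]
print(f"balance:  r = {r_balance:.2f}")   # strong: acts as a proxy
print(f"duration: r = {r_duration:.2f}")  # near zero
```

Correlation only catches linear leakage; a stricter test is to train a classifier to predict the sensitive attribute from the features and compare its accuracy against the base rate.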
Recommended Mitigation Strategies¶
- Fairness-Aware Learning:
  - Implement fairness constraints during model training to equalize error rates across groups.
  - Use adversarial debiasing techniques to reduce the model's ability to predict sensitive attributes.
- Data Interventions:
  - Resampling: Balance the dataset to ensure equal representation of different demographic groups.
  - Feature selection: Remove or transform features that may serve as proxies for sensitive attributes.
- Post-Processing Approaches:
  - Adjust decision thresholds differently for each demographic group to equalize outcome rates.
  - Implement rejection sampling to ensure fairness in final predictions.
- Monitoring and Evaluation:
  - Establish continuous monitoring of fairness metrics in production.
  - Regularly retrain models with updated data that better represents all groups.
- Holistic Approach:
  - Consider the broader social context of banking decisions.
  - Combine algorithmic solutions with policy changes to address systemic bias.
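The post-processing strategy above can be sketched as group-specific decision thresholds chosen so each group's positive-prediction rate matches a common target. This is a toy illustration on simulated scores, not a recommendation of any particular threshold:

```python
import numpy as np

def group_thresholds(scores, groups, target_rate):
    """Pick a per-group score threshold so each group's positive rate is near target_rate."""
    thresholds = {}
    for g in np.unique(groups):
        s = scores[groups == g]
        # The (1 - target_rate) quantile leaves roughly target_rate of scores above it.
        thresholds[g] = np.quantile(s, 1 - target_rate)
    return thresholds

rng = np.random.default_rng(1)
groups = np.array(["a"] * 500 + ["b"] * 500)
# Group "b" systematically receives lower model scores.
scores = np.concatenate([rng.uniform(0.2, 1.0, 500), rng.uniform(0.0, 0.8, 500)])

thr = group_thresholds(scores, groups, target_rate=0.3)
rates = {g: (scores[groups == g] >= t).mean() for g, t in thr.items()}
print(thr)    # group "b" gets a lower threshold
print(rates)  # both positive rates land near 0.30
```

A single shared threshold here would select far fewer clients from group `b`; the per-group quantile equalizes selection rates at the cost of applying the score scale differently per group, which is precisely the trade-off regulators scrutinize.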
Ethical Considerations¶
The ethical use of machine learning in banking requires balancing predictive performance with fairness concerns. Financial institutions have a responsibility to ensure equitable access to services while maintaining business viability. Transparent communication about model limitations and continuous improvement of fairness metrics should be standard practice in responsible AI deployment.
Model Deployability Comparison¶
The table below provides a comprehensive comparison of the three machine learning models evaluated in this analysis, with a focus on their deployability in a real-world banking context.
| Factor | SVM | Random Forest | Logistic Regression | Notes |
|---|---|---|---|---|
| Overall Accuracy | ⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐ | Random Forest achieves highest overall accuracy |
| Fairness - Marital Status | ⭐⭐⭐⭐ | ⭐⭐⭐ | ⭐⭐⭐⭐ | SVM and LR have more consistent performance across marital groups |
| Fairness - Education | ⭐⭐⭐ | ⭐⭐ | ⭐⭐⭐⭐ | LR shows smallest disparity across education levels |
| Computational Efficiency | ⭐⭐ | ⭐ | ⭐⭐⭐⭐⭐ | LR is significantly more efficient for large-scale deployment |
| Interpretability | ⭐⭐ | ⭐⭐⭐ | ⭐⭐⭐⭐⭐ | LR coefficients directly indicate feature importance |
| Robustness to Outliers | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐ | SVM is least affected by outliers |
| Scalability | ⭐⭐ | ⭐⭐⭐ | ⭐⭐⭐⭐⭐ | LR scales better to large datasets |
| Regulatory Compliance | ⭐⭐ | ⭐⭐⭐ | ⭐⭐⭐⭐⭐ | LR's interpretability makes it easier to explain to regulators |
| Ease of Updates | ⭐⭐⭐ | ⭐⭐ | ⭐⭐⭐⭐⭐ | LR models can be updated incrementally with new data |
| Bias Mitigation Potential | ⭐⭐⭐ | ⭐⭐ | ⭐⭐⭐⭐⭐ | LR allows for more straightforward bias mitigation strategies |