# -*- coding: utf-8 -*-
"""dimensionality_reduction_using_feature_selection.py

Automatically generated by Colaboratory.

Original file is located at
https://colab.research.google.com/drive/1pE7KFZcxTLcnAuXUXcKSfsfskiMMPVQd
"""
# Load libraries
import warnings

import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.datasets import load_iris, make_regression
from sklearn.feature_selection import (RFECV, SelectKBest, SelectPercentile,
                                       VarianceThreshold, chi2, f_classif)
from sklearn.preprocessing import StandardScaler

# Load the iris data
iris = datasets.load_iris()
# Create features and target
features_i = iris.data
target_i = iris.target
# Create the variance thresholder
thresholder = VarianceThreshold(threshold=.4)
# Create a feature matrix containing only the high-variance features
f_high_variance = thresholder.fit_transform(features_i)
# View high variance feature matrix
f_high_variance[0:3]
# View variances
thresholder.fit(features_i).variances_
# Standardize the feature matrix
scaler = StandardScaler()
f_std = scaler.fit_transform(features_i)
# Calculate the variance of each standardized feature
selection = VarianceThreshold()
selection.fit(f_std).variances_
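# Note (illustrative check): standardization gives every feature unit variance,
# so a variance threshold can no longer tell the features apart; avoid applying
# variance thresholding to standardized data.
print(selection.variances_)  # expected: approximately 1.0 for every feature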
# Create a binary feature matrix with:
# Feature 0: 80% class 0
# Feature 1: 80% class 1
# Feature 2: 60% class 0, 40% class 1
features_i = [[0, 1, 0],
              [0, 1, 1],
              [0, 1, 0],
              [0, 1, 1],
              [1, 0, 0]]
# Threshold by the variance of a Bernoulli variable: Var(x) = p(1 - p)
thresholding = VarianceThreshold(threshold=(.65 * (1 - .65)))
thresholding.fit_transform(features_i)
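# Quick illustrative check: compute p(1 - p) for each binary feature by hand
# to see why only Feature 2 clears the 0.65 * (1 - 0.65) = 0.2275 threshold.
p = np.mean(features_i, axis=0)  # proportion of ones in each feature
print(p * (1 - p))               # expected: [0.16, 0.16, 0.24]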
# Create feature matrix with two highly correlated features
features_m = np.array([[1, 1, 1],
                       [2, 2, 0],
                       [3, 3, 1],
                       [4, 4, 0],
                       [5, 5, 1],
                       [6, 6, 0],
                       [7, 7, 1],
                       [8, 7, 0],
                       [9, 7, 1]])
# Convert the feature matrix into a DataFrame
dataframe = pd.DataFrame(features_m)
# Create the correlation matrix
corr_m = dataframe.corr().abs()
# Select the upper triangle of the correlation matrix
upper1 = corr_m.where(np.triu(np.ones(corr_m.shape),
                              k=1).astype(bool))
# Find the index of feature columns with correlation greater than 0.85
droping = [col for col in upper1.columns if any(upper1[col] > 0.85)]
# Drop the flagged features
dataframe.drop(dataframe.columns[droping], axis=1).head(3)
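# Illustrative check: column 1 is flagged because it is nearly identical to
# column 0, while column 2 is uncorrelated with both.
print(droping)  # expected: [1]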
# Load data
iris_i = load_iris()
features_v = iris_i.data
target = iris_i.target
# Convert to categorical integer data (chi-squared requires non-negative counts)
features_v = features_v.astype(int)
# Select the two features with the highest chi-squared statistics
chi2_s = SelectKBest(chi2, k=2)
f_kbest = chi2_s.fit_transform(features_v, target)
# Show results
print("Original number of features:", features_v.shape[1])
print("Reduced number of features:", f_kbest.shape[1])
# Select the two features with the highest F-values
f_selector = SelectKBest(f_classif, k=2)
f_kbest = f_selector.fit_transform(features_v, target)
# Display results
print("Original number of features:", features_v.shape[1])
print("Reduced number of features:", f_kbest.shape[1])
# Select the top 65% of features by F-value
f_selector = SelectPercentile(f_classif, percentile=65)
f_kbest = f_selector.fit_transform(features_v, target)
# Display results
print("Original number of features:", features_v.shape[1])
print("Reduced number of features:", f_kbest.shape[1])
# Suppress an annoying but harmless warning
warnings.filterwarnings(action="ignore", module="scipy",
                        message="^internal gelsd")
# Generate a features matrix and a target vector
features_f, target_t = make_regression(n_samples=10000,
                                        n_features=100,
                                        n_informative=2,
                                        random_state=1)
# Create a linear regression estimator
ols = linear_model.LinearRegression()
# Recursively eliminate features with cross-validation
rfecv = RFECV(estimator=ols, step=2, scoring="neg_mean_squared_error")
rfecv.fit(features_f, target_t)
rfecv.transform(features_f)
# Number of best features
rfecv.n_features_
# Which features are selected as best?
rfecv.support_
# We can even see how the features are ranked
rfecv.ranking_
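# Illustrative summary of the RFECV results using its standard attributes:
# the column indices that were kept and how many features remain.
print("Selected feature indices:", np.where(rfecv.support_)[0])
print("Number of selected features:", rfecv.n_features_)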
help(SelectKBest)