Code for Dimensionality Reduction: Feature Extraction using Scikit-learn in Python Tutorial

View on Github

# -*- coding: utf-8 -*-
Automatically generated by Colaboratory.
Original file is located at

from sklearn import datasets
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA, KernelPCA
from sklearn.datasets import make_circles
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

# Load the data
digits = datasets.load_digits()
# Feature matrix standardization
features = StandardScaler().fit_transform(
# Perform PCA While retaining 80% of variance
pca = PCA(n_components=0.95, whiten=True)
# perform PCA
pcafeatures = pca.fit_transform(features)
# Display results
print("Original number of features:", features.shape[1])
print("Reduced number of features:", pcafeatures.shape[1])

# Creation of the linearly inseparable data
features, _ = make_circles(n_samples=2000, random_state=1, noise=0.1, factor=0.1)
# kernal PCA with radius basis function (RBF) kernel application
k_pca = KernelPCA(kernel="rbf", gamma=16, n_components=1)
k_pcaf = k_pca.fit_transform(features)
print("Original number of features:", features.shape[1])
print("Reduced number of features:", k_pcaf.shape[1])

#flower dataset loading:
iris = datasets.load_iris()
features =
target =
# Creation of LDA. Use of LDA for features transformation
lda = LinearDiscriminantAnalysis(n_components=1)
features_lda =, target).transform(features)
# Print the number of features
print("number of features(original):", features.shape[1])
print("number of features that was reduced:", features_lda.shape[1])


# Load Iris flower dataset:
iris123 = datasets.load_iris()
features =
target =
# Create and run LDA
lda_r = LinearDiscriminantAnalysis(n_components=None)
features_lda =, target)
# array of explained variance ratios
lda_var_r = lda_r.explained_variance_ratio_
# function ceration
def select_n_c(v_ratio, g_var: float) -> int:
    # initial variance explained setting
    total_v = 0.0
    # number of features initialisation
    n_components = 0
    # If we consider explained variance of each feature:
    for explained_v in v_ratio:
        # explained variance addition to the total
        total_v += explained_v
        # add one to number of components
        n_components += 1
        # we attain our goal level of explained variance
        if total_v >= g_var:
            # end the loop
    # return the number of components
    return n_components

# run the function
select_n_c(lda_var_r, 0.95)

# data loading
digit = datasets.load_digits()
# feature matrix loading
feature_m =
# Creation, fit and application of NMF
n_mf = NMF(n_components=12, random_state=1)
features_nmf = n_mf.fit_transform(feature_m)
# Show results
print("Original number of features:", feature_m.shape[1])
print("Reduced number of features:", features_nmf.shape[1])

# data loading
digit123 = datasets.load_digits()
#  feature matrix Standardization
features_m = StandardScaler().fit_transform(
# sparse matrix creation
f_sparse = csr_matrix(features_m)
# TSVD creation
tsvd = TruncatedSVD(n_components=12)
# sparse matrix TSVD
features_sp_tsvd =
# results
print("Original number of features:", f_sparse.shape[1])
print("Reduced number of features:", features_sp_tsvd.shape[1])

# Sum of first three components' explained variance ratios