Code for the Data Cleaning with Pandas in Python Tutorial



data_cleaning.py

import pandas as pd

# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)

# Import CSV data
data_frames = pd.read_csv('simulated_data.csv')

print(data_frames.head(10))

data_cleaning2.py

import pandas as pd

# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)

# Import CSV data
data_frames = pd.read_csv('simulated_data.csv')

print(data_frames.info())

data_cleaning3.py

import pandas as pd

# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)

# Import CSV data
data_frames = pd.read_csv('simulated_data.csv')

# Data Type Conversion
# Remove '$' from donation strings
data_frames['donation'] = data_frames['donation'].str.strip('$')

# Convert donation strings into numerical data type
data_frames['donation'] = data_frames['donation'].astype('float64')

print(data_frames.head(10))
print(data_frames.info())
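
If you prefer a single step, pandas' built-in parser can strip and convert at once. A minimal sketch (not part of the tutorial files); errors='coerce' turns any unparseable value into NaN instead of raising:

# Alternative: strip the '$' and parse in one pass
data_frames['donation'] = pd.to_numeric(
    data_frames['donation'].str.strip('$'), errors='coerce'
)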

data_cleaning4.py

import pandas as pd

# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)

# Import CSV data
data_frames = pd.read_csv('simulated_data.csv')

# Data Type Conversion
# Remove '$' from donation strings
data_frames['donation'] = data_frames['donation'].str.strip('$')

# Convert donation strings into numerical data type
data_frames['donation'] = data_frames['donation'].astype('float64')


# Handle Data Inconsistencies
# Capitalize strings
data_frames['street_address'] = data_frames['street_address'].str.split()

def capitalize_words(words):
    # skip index 0 (the street number) and capitalize the rest in place
    for index, word in enumerate(words):
        if index > 0:
            words[index] = word.capitalize()

# capitalize_words mutates each list in place, so the apply() result is unused
data_frames['street_address'].apply(capitalize_words)
data_frames['street_address'] = data_frames['street_address'].str.join(' ')

print(data_frames['street_address'])
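
For this dataset the split/apply/join round trip above can be replaced by one vectorized call, since str.title() capitalizes each word and leaves the digits of the street number untouched. A sketch:

# Alternative: title-case the whole address in one vectorized call
data_frames['street_address'] = data_frames['street_address'].str.title()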

data_cleaning5.py

import pandas as pd

# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)

# Import CSV data
data_frames = pd.read_csv('simulated_data.csv')

# Data Type Conversion
# Remove '$' from donation strings
data_frames['donation'] = data_frames['donation'].str.strip('$')

# Convert donation strings into numerical data type
data_frames['donation'] = data_frames['donation'].astype('float64')


# Handle Data Inconsistencies
# Normalize strings
data_frames['street_address'] = data_frames['street_address'].str.split()

def normalize_words(words):
    # skip index 0 (the street number) and normalize the rest in place
    for index, word in enumerate(words):
        if index > 0:
            words[index] = normalize(word)

def normalize(word):
    # expand common road-type abbreviations, then capitalize
    if word.lower() == 'st':
        word = 'street'
    elif word.lower() == 'rd':
        word = 'road'

    return word.capitalize()


# normalize_words mutates each list in place, so the apply() result is unused
data_frames['street_address'].apply(normalize_words)
data_frames['street_address'] = data_frames['street_address'].str.join(' ')

print(data_frames.head(10))
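
The abbreviation expansion can also be done on the joined strings with case-insensitive, word-boundary regexes, which avoids mutating lists in place. A sketch:

# Alternative: expand abbreviations with regexes, then title-case
addresses = data_frames['street_address']
addresses = addresses.str.replace(r'\bst\b', 'Street', case=False, regex=True)
addresses = addresses.str.replace(r'\brd\b', 'Road', case=False, regex=True)
data_frames['street_address'] = addresses.str.title()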

data_cleaning6.py

import pandas as pd

# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)

# Import CSV data
data_frames = pd.read_csv('simulated_data.csv')

# Data Type Conversion
# Remove '$' from donation strings
data_frames['donation'] = data_frames['donation'].str.strip('$')

# Convert donation strings into numerical data type
data_frames['donation'] = data_frames['donation'].astype('float64')


# Handle Data Inconsistencies
# Normalize strings
data_frames['street_address'] = data_frames['street_address'].str.split()

def normalize_words(words):
    # skip index 0 (the street number) and normalize the rest in place
    for index, word in enumerate(words):
        if index > 0:
            words[index] = normalize(word)

def normalize(word):
    # expand common road-type abbreviations, then capitalize
    if word.lower() == 'st':
        word = 'street'
    elif word.lower() == 'rd':
        word = 'road'

    return word.capitalize()


# normalize_words mutates each list in place, so the apply() result is unused
data_frames['street_address'].apply(normalize_words)
data_frames['street_address'] = data_frames['street_address'].str.join(' ')


# Remove Out-of-Range Data
# create boolean Series marking out-of-range (negative) donations
out_of_range = data_frames['donation'] < 0

# replace out-of-range donations with NaN: assigning the filtered Series back
# aligns on the index, so the rows stay and the filtered values become NaN
data_frames['donation'] = data_frames['donation'][~out_of_range]

print(data_frames.head(10))
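
The boolean-indexing assignment above keeps every row and turns the filtered-out donations into NaN through index alignment. Series.mask() expresses the same intent more directly; a sketch:

# Equivalent: replace values where the condition is True with NaN
data_frames['donation'] = data_frames['donation'].mask(out_of_range)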

data_cleaning7.py

import pandas as pd

# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)

# Import CSV data
data_frames = pd.read_csv('simulated_data.csv')

# Data Type Conversion
# Remove '$' from donation strings
data_frames['donation'] = data_frames['donation'].str.strip('$')

# Convert donation strings into numerical data type
data_frames['donation'] = data_frames['donation'].astype('float64')


# Handle Data Inconsistencies
# Normalize strings
data_frames['street_address'] = data_frames['street_address'].str.split()

def normalize_words(words):
    # skip index 0 (the street number) and normalize the rest in place
    for index, word in enumerate(words):
        if index > 0:
            words[index] = normalize(word)

def normalize(word):
    # expand common road-type abbreviations, then capitalize
    if word.lower() == 'st':
        word = 'street'
    elif word.lower() == 'rd':
        word = 'road'

    return word.capitalize()


# normalize_words mutates each list in place, so the apply() result is unused
data_frames['street_address'].apply(normalize_words)
data_frames['street_address'] = data_frames['street_address'].str.join(' ')


# Remove Out-of-Range Data
# create boolean Series marking out-of-range (negative) donations
out_of_range = data_frames['donation'] < 0

# replace out-of-range donations with NaN: assigning the filtered Series back
# aligns on the index, so the rows stay and the filtered values become NaN
data_frames['donation'] = data_frames['donation'][~out_of_range]


# Remove duplicates
columns_to_check = ['first_name', 'last_name', 'street_address', 'city', 'state']
data_frames_no_dupes = data_frames.drop_duplicates(subset=columns_to_check, keep='first')

print(data_frames_no_dupes.info())
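
Before dropping anything, it can help to eyeball which rows drop_duplicates() will treat as copies. duplicated() with keep=False flags every member of each duplicate group; a sketch:

# Optional: inspect all rows that belong to a duplicate group
dupes = data_frames[data_frames.duplicated(subset=columns_to_check, keep=False)]
print(dupes.sort_values(columns_to_check))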

data_cleaning8.py

import pandas as pd

# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)

# Import CSV data
data_frames = pd.read_csv('simulated_data.csv')

# Data Type Conversion
# Remove '$' from donation strings
data_frames['donation'] = data_frames['donation'].str.strip('$')

# Convert donation strings into numerical data type
data_frames['donation'] = data_frames['donation'].astype('float64')


# Handle Data Inconsistencies
# Normalize strings
data_frames['street_address'] = data_frames['street_address'].str.split()

def normalize_words(words):
    # skip index 0 (the street number) and normalize the rest in place
    for index, word in enumerate(words):
        if index > 0:
            words[index] = normalize(word)

def normalize(word):
    # expand common road-type abbreviations, then capitalize
    if word.lower() == 'st':
        word = 'street'
    elif word.lower() == 'rd':
        word = 'road'

    return word.capitalize()


# normalize_words mutates each list in place, so the apply() result is unused
data_frames['street_address'].apply(normalize_words)
data_frames['street_address'] = data_frames['street_address'].str.join(' ')


# Remove Out-of-Range Data
# create boolean Series marking out-of-range (negative) donations
out_of_range = data_frames['donation'] < 0

# replace out-of-range donations with NaN: assigning the filtered Series back
# aligns on the index, so the rows stay and the filtered values become NaN
data_frames['donation'] = data_frames['donation'][~out_of_range]


# Remove duplicates
columns_to_check = ['first_name', 'last_name', 'street_address', 'city', 'state']
data_frames_no_dupes = data_frames.drop_duplicates(subset=columns_to_check, keep='first')


# Drop Missing Data
columns_to_check = ['state', 'donation']
data_frames_no_missing = data_frames_no_dupes.dropna(subset=columns_to_check)


print(data_frames_no_missing.head(20))
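
A quick per-column count of missing values shows how much dropna() is about to remove; a sketch:

# Optional: count NaNs per column before dropping rows
print(data_frames_no_dupes.isna().sum())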

data_cleaning9.py

import pandas as pd

# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)

# Import CSV data
data_frames = pd.read_csv('simulated_data.csv')

# Data Type Conversion
# Remove '$' from donation strings
data_frames['donation'] = data_frames['donation'].str.strip('$')

# Convert donation strings into numerical data type
data_frames['donation'] = data_frames['donation'].astype('float64')


# Handle Data Inconsistencies
# Normalize strings
data_frames['street_address'] = data_frames['street_address'].str.split()

def normalize_words(words):
    # skip index 0 (the street number) and normalize the rest in place
    for index, word in enumerate(words):
        if index > 0:
            words[index] = normalize(word)

def normalize(word):
    # expand common road-type abbreviations, then capitalize
    if word.lower() == 'st':
        word = 'street'
    elif word.lower() == 'rd':
        word = 'road'

    return word.capitalize()


# normalize_words mutates each list in place, so the apply() result is unused
data_frames['street_address'].apply(normalize_words)
data_frames['street_address'] = data_frames['street_address'].str.join(' ')


# Remove Out-of-Range Data
# create boolean Series marking out-of-range (negative) donations
out_of_range = data_frames['donation'] < 0

# replace out-of-range donations with NaN: assigning the filtered Series back
# aligns on the index, so the rows stay and the filtered values become NaN
data_frames['donation'] = data_frames['donation'][~out_of_range]


# Remove duplicates
columns_to_check = ['first_name', 'last_name', 'street_address', 'city', 'state']
data_frames_no_dupes = data_frames.drop_duplicates(subset=columns_to_check, keep='first')


# Drop Missing Data
columns_to_check = ['state', 'donation']
data_frames_no_missing = data_frames_no_dupes.dropna(subset=columns_to_check)


print(data_frames_no_missing.head(20))


data_frames_no_missing.to_csv('clean_donations_data.csv', index=False)
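
As a final sanity check, reading the exported file back confirms the cleaned values survived the round trip; a sketch:

# Optional: round-trip check on the exported file
clean = pd.read_csv('clean_donations_data.csv')
print(clean.info())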

helpers.py

import random

def add_donations(rows):
    total_donations = len(rows)

    # create list of random donation values
    donations = list_of_donations(total_donations)

    # append one donation to each record
    for count in range(total_donations):
        rows[count].append(donations[count])


def create_row_base():
    # the empty-string options simulate missing values in the raw data
    first_name_options = ['Rosemaria', 'Jodi', 'Alvy', 'Blake', 'Ellis', '']
    last_name_options = ['Roderick', 'Hesbrook', 'Summerton', 'Rappport', 'Alben', '']
    city_options = ['Hialeah', 'Arlington', 'Springfield', 'Carrollton', 'Cambridge', '']
    state_options = ['CT', 'NY', 'VA', 'WA', 'AZ', '']

    first_name = random.choice(first_name_options)
    last_name = random.choice(last_name_options)
    street = street_address()
    city = random.choice(city_options)
    state = random.choice(state_options)
    
    return [
        first_name, 
        last_name, 
        street, 
        city, 
        state
    ]


def list_of_donations(size):
    donations = []

    donation_amt = random_dollar_amt()
    for i in range(size):
        # 4 times out of 5, pick a new amount; otherwise repeat the last one
        if random.choice([1, 2, 3, 4, 5]) > 1:
            donation_amt = random_dollar_amt()
        donations.append(donation_amt)

    return donations


def random_dollar_amt():
    # negative amounts are intentional: they become the out-of-range values
    dollars = random.randint(-50, 200)
    cents = random.randint(0, 99)
    # zero-pad cents so 5 cents renders as $12.05, not $12.5
    return '${}.{:02d}'.format(dollars, cents)


def scramble_capitalization(text):
    # randomly upper- or lowercase each letter to simulate messy formatting
    final_str = ''
    for letter in text:
        final_str += random.choice([letter.upper(), letter.lower()])
    return final_str


def street_address():
    num = random.randint(40, 1001)
    road_name = random.choice(['Western Plank', 'Forest Run', 'Kings', 'Oaktree'])
    # 'St', 'Rd', and '' simulate inconsistent or missing road types
    road_type = random.choice(['Street', 'St', 'Road', 'Rd', ''])

    address = '{} {} {}'.format(num, road_name, road_type)
    return address
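
To spot-check the generators interactively, a small guard like this (an addition, not one of the tutorial files) can be appended to helpers.py; output varies per run because every value is random:

if __name__ == '__main__':
    # print one simulated base record and one dollar string
    print(create_row_base())
    print(random_dollar_amt())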

simulator.py

import csv
import random
import helpers

def generate_dataset():
    rows = []

    # generate 20 base records: names data + address data
    for _ in range(20):
        row_to_add = helpers.create_row_base()
        rows.append(row_to_add)

        # 3 times out of 5, append a duplicate of the record just added
        if random.choice([1, 2, 3, 4, 5]) > 2:
            rows.append(row_to_add.copy())
            # scramble the capitalization of the duplicate's street address
            rows[-1][2] = helpers.scramble_capitalization(rows[-1][2])

    # add donation amounts to each record
    helpers.add_donations(rows)

    return rows


# newline='' prevents the csv module from writing blank lines on Windows
with open('simulated_data.csv', 'w', newline='') as f:
    f_csv = csv.writer(f)
    
    # write headers first
    f_csv.writerow(['first_name','last_name','street_address',
        'city','state', 'donation'])
    f_csv.writerows(generate_dataset())
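
To reproduce the tutorial end to end, run simulator.py first so that simulated_data.csv exists, then run any of the data_cleaning scripts against it (all files are assumed to sit in the same directory); data_cleaning9.py writes the final clean_donations_data.csv.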