data_cleaning.py
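# Load the simulated donations CSV and preview the first rows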
import pandas as pd
# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)
# Import CSV data
data_frames = pd.read_csv(r'simulated_data.csv')
print(data_frames.head(10))
data_cleaning2.py
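# Inspect column dtypes and non-null counts with info()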
import pandas as pd
# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)
# Import CSV data
data_frames = pd.read_csv(r'simulated_data.csv')
print(data_frames.info())
data_cleaning3.py
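# Extends data_cleaning2.py: convert the donation strings to floats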
import pandas as pd
# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)
# Import CSV data
data_frames = pd.read_csv(r'simulated_data.csv')
# Data Type Conversion
# Remove '$' from donation strings
data_frames['donation'] = data_frames['donation'].str.strip('$')
# Convert donation strings into a numerical data type
data_frames['donation'] = data_frames['donation'].astype('float64')
print(data_frames.head(10))
print(data_frames.info())
data_cleaning4.py
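# Extends data_cleaning3.py: capitalize the words of each street address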
import pandas as pd
# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)
# Import CSV data
data_frames = pd.read_csv(r'simulated_data.csv')
# Data Type Conversion
# Remove '$' from donation strings
data_frames['donation'] = data_frames['donation'].str.strip('$')
# Convert donation strings into a numerical data type
data_frames['donation'] = data_frames['donation'].astype('float64')
# Handle Data Inconsistencies
# Capitalize the words of each street address
data_frames['street_address'] = data_frames['street_address'].str.split()
def capitalize_words(arr):
    for index, word in enumerate(arr):
        # leave the house number (index 0) as-is
        if index > 0:
            arr[index] = word.capitalize()

# capitalize_words mutates each word list in place
data_frames['street_address'].apply(capitalize_words)
data_frames['street_address'] = data_frames['street_address'].str.join(' ')
print(data_frames['street_address'])
data_cleaning5.py
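# Extends data_cleaning4.py: also expand 'St'/'Rd' while capitalizing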
import pandas as pd
# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)
# Import CSV data
data_frames = pd.read_csv(r'simulated_data.csv')
# Data Type Conversion
# Remove '$' from donation strings
data_frames['donation'] = data_frames['donation'].str.strip('$')
# Convert donation strings into a numerical data type
data_frames['donation'] = data_frames['donation'].astype('float64')
# Handle Data Inconsistencies
# Normalize street-address words (expand abbreviations, fix capitalization)
data_frames['street_address'] = data_frames['street_address'].str.split()
def normalize_words(arr):
    for index, word in enumerate(arr):
        # leave the house number (index 0) as-is
        if index > 0:
            arr[index] = normalize(word)

def normalize(word):
    # expand common road-type abbreviations, then capitalize
    if word.lower() == 'st':
        word = 'street'
    elif word.lower() == 'rd':
        word = 'road'
    return word.capitalize()

# normalize_words mutates each word list in place
data_frames['street_address'].apply(normalize_words)
data_frames['street_address'] = data_frames['street_address'].str.join(' ')
print(data_frames.head(10))
data_cleaning6.py
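# Extends data_cleaning5.py: blank out negative (out-of-range) donations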
import pandas as pd
# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)
# Import CSV data
data_frames = pd.read_csv(r'simulated_data.csv')
# Data Type Conversion
# Remove '$' from donation strings
data_frames['donation'] = data_frames['donation'].str.strip('$')
# Convert donation strings into a numerical data type
data_frames['donation'] = data_frames['donation'].astype('float64')
# Handle Data Inconsistencies
# Normalize street-address words (expand abbreviations, fix capitalization)
data_frames['street_address'] = data_frames['street_address'].str.split()
def normalize_words(arr):
    for index, word in enumerate(arr):
        # leave the house number (index 0) as-is
        if index > 0:
            arr[index] = normalize(word)

def normalize(word):
    # expand common road-type abbreviations, then capitalize
    if word.lower() == 'st':
        word = 'street'
    elif word.lower() == 'rd':
        word = 'road'
    return word.capitalize()

# normalize_words mutates each word list in place
data_frames['street_address'].apply(normalize_words)
data_frames['street_address'] = data_frames['street_address'].str.join(' ')
# Remove Out-of-Range Data
# create a boolean Series flagging negative (out-of-range) donations
out_of_range = data_frames['donation'] < 0
# index alignment turns the flagged donations into NaN
data_frames['donation'] = data_frames['donation'][~out_of_range]
print(data_frames.head(10))
data_cleaning7.py
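# Extends data_cleaning6.py: drop duplicate donor records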
import pandas as pd
# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)
# Import CSV data
data_frames = pd.read_csv(r'simulated_data.csv')
# Data Type Conversion
# Remove '$' from donation strings
data_frames['donation'] = data_frames['donation'].str.strip('$')
# Convert donation strings into a numerical data type
data_frames['donation'] = data_frames['donation'].astype('float64')
# Handle Data Inconsistencies
# Normalize street-address words (expand abbreviations, fix capitalization)
data_frames['street_address'] = data_frames['street_address'].str.split()
def normalize_words(arr):
    for index, word in enumerate(arr):
        # leave the house number (index 0) as-is
        if index > 0:
            arr[index] = normalize(word)

def normalize(word):
    # expand common road-type abbreviations, then capitalize
    if word.lower() == 'st':
        word = 'street'
    elif word.lower() == 'rd':
        word = 'road'
    return word.capitalize()

# normalize_words mutates each word list in place
data_frames['street_address'].apply(normalize_words)
data_frames['street_address'] = data_frames['street_address'].str.join(' ')
# Remove Out-of-Range Data
# create a boolean Series flagging negative (out-of-range) donations
out_of_range = data_frames['donation'] < 0
# index alignment turns the flagged donations into NaN
data_frames['donation'] = data_frames['donation'][~out_of_range]
# Remove duplicates
columns_to_check = ['first_name', 'last_name', 'street_address', 'city', 'state']
data_frames_no_dupes = data_frames.drop_duplicates(subset=columns_to_check, keep='first')
print(data_frames_no_dupes.info())
data_cleaning8.py
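# Extends data_cleaning7.py: drop rows missing a state or donation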
import pandas as pd
# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)
# Import CSV data
data_frames = pd.read_csv(r'simulated_data.csv')
# Data Type Conversion
# Remove '$' from donation strings
data_frames['donation'] = data_frames['donation'].str.strip('$')
# Convert donation strings into a numerical data type
data_frames['donation'] = data_frames['donation'].astype('float64')
# Handle Data Inconsistencies
# Normalize street-address words (expand abbreviations, fix capitalization)
data_frames['street_address'] = data_frames['street_address'].str.split()
def normalize_words(arr):
    for index, word in enumerate(arr):
        # leave the house number (index 0) as-is
        if index > 0:
            arr[index] = normalize(word)

def normalize(word):
    # expand common road-type abbreviations, then capitalize
    if word.lower() == 'st':
        word = 'street'
    elif word.lower() == 'rd':
        word = 'road'
    return word.capitalize()

# normalize_words mutates each word list in place
data_frames['street_address'].apply(normalize_words)
data_frames['street_address'] = data_frames['street_address'].str.join(' ')
# Remove Out-of-Range Data
# create a boolean Series flagging negative (out-of-range) donations
out_of_range = data_frames['donation'] < 0
# index alignment turns the flagged donations into NaN;
# those rows are removed below by dropna()
data_frames['donation'] = data_frames['donation'][~out_of_range]
# Remove duplicates
columns_to_check = ['first_name', 'last_name', 'street_address', 'city', 'state']
data_frames_no_dupes = data_frames.drop_duplicates(subset=columns_to_check, keep='first')
# Drop Missing Data
columns_to_check = ['state', 'donation']
data_frames_no_missing = data_frames_no_dupes.dropna(subset=columns_to_check)
print(data_frames_no_missing.head(20))
data_cleaning9.py
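# Extends data_cleaning8.py: write the cleaned data to a new CSV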
import pandas as pd
# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)
# Import CSV data
data_frames = pd.read_csv(r'simulated_data.csv')
# Data Type Conversion
# Remove '$' from donation strings
data_frames['donation'] = data_frames['donation'].str.strip('$')
# Convert donation strings into a numerical data type
data_frames['donation'] = data_frames['donation'].astype('float64')
# Handle Data Inconsistencies
# Normalize street-address words (expand abbreviations, fix capitalization)
data_frames['street_address'] = data_frames['street_address'].str.split()
def normalize_words(arr):
    for index, word in enumerate(arr):
        # leave the house number (index 0) as-is
        if index > 0:
            arr[index] = normalize(word)

def normalize(word):
    # expand common road-type abbreviations, then capitalize
    if word.lower() == 'st':
        word = 'street'
    elif word.lower() == 'rd':
        word = 'road'
    return word.capitalize()

# normalize_words mutates each word list in place
data_frames['street_address'].apply(normalize_words)
data_frames['street_address'] = data_frames['street_address'].str.join(' ')
# Remove Out-of-Range Data
# create a boolean Series flagging negative (out-of-range) donations
out_of_range = data_frames['donation'] < 0
# index alignment turns the flagged donations into NaN;
# those rows are removed below by dropna()
data_frames['donation'] = data_frames['donation'][~out_of_range]
# Remove duplicates
columns_to_check = ['first_name', 'last_name', 'street_address', 'city', 'state']
data_frames_no_dupes = data_frames.drop_duplicates(subset=columns_to_check, keep='first')
# Drop Missing Data
columns_to_check = ['state', 'donation']
data_frames_no_missing = data_frames_no_dupes.dropna(subset=columns_to_check)
print(data_frames_no_missing.head(20))
# write the cleaned data to a new CSV file
data_frames_no_missing.to_csv(r'clean_donations_data.csv', index=False)
helpers.py
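# Helper functions used by simulator.py to build messy donor records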
import random
def add_donations(rows):
    total_donations = len(rows)
    # create list of random donation values
    donations = list_of_donations(total_donations)
    # append one donation onto each record
    for count in range(total_donations):
        rows[count].append(donations[count])
def create_row_base():
    # empty strings simulate missing values in the CSV
    first_name_options = ['Rosemaria', 'Jodi', 'Alvy', 'Blake', 'Ellis', '']
    last_name_options = ['Roderick', 'Hesbrook', 'Summerton', 'Rappport', 'Alben', '']
    city_options = ['Hialeah', 'Arlington', 'Springfield', 'Carrollton', 'Cambridge', '']
    state_options = ['CT', 'NY', 'VA', 'WA', 'AZ', '']
    first_name = random.choice(first_name_options)
    last_name = random.choice(last_name_options)
    street = street_address()
    city = random.choice(city_options)
    state = random.choice(state_options)
    return [
        first_name,
        last_name,
        street,
        city,
        state
    ]
def list_of_donations(size):
    donations = []
    donation_amt = random_dollar_amt()
    for i in range(size):
        # usually (4 times in 5) generate a fresh donation value;
        # otherwise repeat the previous one
        if random.choice([1, 2, 3, 4, 5]) > 1:
            donation_amt = random_dollar_amt()
        donations.append(donation_amt)
    return donations
def random_dollar_amt():
    # negative amounts simulate out-of-range data
    dollars = random.randint(-50, 200)
    cents = random.randint(0, 99)
    # pad cents to two digits so 5 cents becomes '.05', not '.5'
    return '${}.{:02d}'.format(dollars, cents)
def scramble_capitalization(text):
    # randomly upper- or lower-case each letter
    final_str = ''
    for letter in text:
        final_str += random.choice([letter.upper(), letter.lower()])
    return final_str
def street_address():
    num = random.randint(40, 1001)
    road_name = random.choice(['Western Plank', 'Forest Run', 'Kings', 'Oaktree'])
    # inconsistent road types (and a missing one) simulate messy data
    road_type = random.choice(['Street', 'St', 'Road', 'Rd', ''])
    address = '{} {} {}'.format(num, road_name, road_type)
    return address
simulator.py
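# Run this script first: it writes the simulated_data.csv consumed by the
# data_cleaning scripts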
import csv
import random
import helpers
def generate_dataset():
    rows = []
    count = 0
    # generate list of base records: names data + address data
    while count < 20:
        row_to_add = helpers.create_row_base()
        rows.append(row_to_add)
        # randomly add duplicate records
        if random.choice([1, 2, 3, 4, 5]) > 2:
            rows.append(row_to_add.copy())
            # scramble formatting of the duplicate's street address
            rows[-1][2] = helpers.scramble_capitalization(rows[-1][2])
        count += 1
    # add donation amounts to each record
    helpers.add_donations(rows)
    return rows
# newline='' prevents the csv module writing blank rows on Windows
with open('simulated_data.csv', 'w', newline='') as f:
    f_csv = csv.writer(f)
    # write headers first
    f_csv.writerow(['first_name', 'last_name', 'street_address',
                    'city', 'state', 'donation'])
    f_csv.writerows(generate_dataset())