Tuesday, 6 June 2017

Data Analysis and Reporting

Data Analysis:

Install the pandas module:

pip install pandas

After installing, import the pandas module:

import pandas

Help on pandas:

import pandas
help(pandas.DataFrame)



Example:

from pandas import DataFrame, read_csv
import pandas as pd
import matplotlib.pyplot as plt
# Build a five-row baby-name data set, round-trip it through a CSV file,
# and report the name with the highest birth count.
names = ['Bob', 'Jessica', 'Mary', 'John', 'Mel']
births = [968, 155, 77, 578, 973]

# Pair each name with its birth count.
BabyDataSet = list(zip(names, births))
print(BabyDataSet)

column_labels = ['Names', 'Births']
df = pd.DataFrame(BabyDataSet, columns=column_labels)
print(df)

# Write only the data rows: no index column and no header line.
csv_path = 'births1880.csv'
df.to_csv(csv_path, header=False, index=False)

# Read the header-less file back three ways to compare the results:
#   1. defaults     -> the first data row is taken as the column names
#   2. header=None  -> every row is kept; columns are numbered 0 and 1
#   3. names=[...]  -> every row is kept under the supplied column names
for read_options in ({}, {'header': None}, {'names': column_labels}):
    df = pd.read_csv(csv_path, **read_options)
    print(df)

# Largest birth count first, so the first row is the most popular name.
Sorted = df.sort_values(by='Births', ascending=False)
print(Sorted.head(1))
print(df['Births'].max())

# Show each column on its own.
for column in column_labels:
    print(df[column])

Example2:

import pandas as pd
from numpy import random
import matplotlib.pyplot as plt
import os
import sys #only needed to determine Python version number
import matplotlib #only needed to determine Matplotlib version number
# Generate 1,000 random (name, births) records, round-trip them through a
# text file, then total the births per name and report the top name.
names = ['Bob','Jessica','Mary','John','Mel']

# Fixed seed so the random draws below are repeatable from run to run.
random.seed(500)
random_names = [names[random.randint(low=0,high=len(names))] for _ in range(1000)]
# Print first 10 records
print(random_names[:10])
births = [random.randint(low=0,high=1000) for _ in range(1000)]
print(births[:10])
BabyDataSet = list(zip(random_names,births))
print(BabyDataSet[:10])
df = pd.DataFrame(data = BabyDataSet, columns=['Names', 'Births'])
print(df[:10])

# Save only the data rows: no index column and no header line.
df.to_csv('births1880.txt',index=False,header=False)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
# Default read: the first record is taken as the header row.
df = pd.read_csv('births1880.txt')
df.info()
# header=None keeps every record as data.
df = pd.read_csv('births1880.txt', header=None)
df.info()
print(df.tail())
# names=[...] keeps every record and supplies the column names.
df = pd.read_csv('births1880.txt', names=['Names','Births'])
print(df.head(5))
print(df['Names'].unique())
for x in df['Names'].unique():
    print(x)
print(df['Names'].describe())
name = df.groupby('Names')

# Apply the sum function to the groupby object
df = name.sum()
print(df)
# Largest total first, so the first row is the name with the most births.
Sorted = df.sort_values(['Births'], ascending=False)

print(Sorted.head(1))

Example3:

import pandas as pd
# Tiny demo frame: two constant numeric columns plus a grouping key column.
d = dict(one=[1] * 5, two=[2] * 5, letter=list('aabbc'))
df = pd.DataFrame(d)
print(df)

# Group on the letter column alone and total the remaining columns per letter.
one = df.groupby(by='letter')
print(one.sum())

# Group on the (letter, one) pair and total what is left for each pair.
letterone = df.groupby(by=['letter', 'one']).sum()
print(letterone)

Example4:

import pandas as pd
import sys
# Two ten-month revenue series (starting January 2012 and January 2013) are
# stacked into one frame; each row is then flagged when its revenue lies more
# than 1.96 standard deviations from the overall mean.
States = ['NY', 'NY', 'NY', 'NY', 'FL', 'FL', 'GA', 'GA', 'FL', 'FL']

# First frame, indexed by month-start dates.
data = [1.0, 2, 3, 4, 5, 6, 7, 8, 9, 10]
idx = pd.date_range(start='1/1/2012', periods=10, freq='MS')
df1 = pd.DataFrame({'Revenue': data, 'State': States}, index=idx)

# Second frame, one year later, with the same state sequence.
data2 = [10.0, 10.0, 9, 9, 8, 8, 7, 7, 6, 6]
idx2 = pd.date_range(start='1/1/2013', periods=10, freq='MS')
df2 = pd.DataFrame({'Revenue': data2, 'State': States}, index=idx2)

# Stack the two frames row-wise.
df = pd.concat([df1, df2])
print(df)

# Work on a copy so the combined frame keeps only its original columns.
newdf = df.copy()
revenue = newdf['Revenue']
deviation = (revenue - revenue.mean()).abs()
threshold = 1.96 * revenue.std()

newdf['x-Mean'] = deviation          # absolute distance from the mean
newdf['1.96*std'] = threshold        # same cut-off value on every row
newdf['Outlier'] = deviation > threshold
print(newdf)



Data science/data reporting:

import matplotlib.pyplot as plt
# Draw a single line through (1, 5), (2, 7), (3, 4) and display the figure.
x_values = [1, 2, 3]
y_values = [5, 7, 4]
plt.plot(x_values, y_values)
plt.show()

Example2:

import matplotlib.pyplot as plt

# Two labelled line series on the same axes, with axis labels, a two-line
# title and a legend.
x = [1, 2, 3]
y = [5, 7, 4]

x2 = [1, 2, 3]
y2 = [10, 14, 12]

# The label given to each line is what the legend displays.
for xs, ys, series_label in ((x, y, 'First Line'), (x2, y2, 'Second Line')):
    plt.plot(xs, ys, label=series_label)

plt.xlabel('Plot Number')
plt.ylabel('Important var')
plt.title('Interesting Graph\n Check it out')
plt.legend()
plt.show()



Example3:

import matplotlib.pyplot as plt
# Two interleaved bar series: the first at odd x positions in the default
# colour, the second at even x positions in green.
odd_positions = [1, 3, 5, 7, 9]
odd_heights = [5, 2, 7, 8, 2]
even_positions = [2, 4, 6, 8, 10]
even_heights = [8, 6, 2, 5, 6]

plt.bar(odd_positions, odd_heights, label="Example one")
plt.bar(even_positions, even_heights, color='g', label="Example two")

plt.legend()
plt.xlabel('bar number')
plt.ylabel('bar height')
plt.title('Epic Graph\n Another Line! Whoa')
plt.show()

Example4:

import matplotlib.pyplot as plt

# Histogram of ages grouped into the bin edges listed below.
population_ages = [22,55,62,45,21,22,34,42,42,4,99,102,110,120,121,122,130,111,115,112,80,75,65,54,44,43,42,48]

bins = [0,10,20,30,40,50,60,70,80,90,100,110,120,130]

# rwidth=0.8 draws each bar at 80% of its bin width, leaving gaps between bars.
# The label gives plt.legend() an entry to show; without any labelled artist
# the legend call only emits a warning and draws nothing.
plt.hist(population_ages, bins, histtype='bar', rwidth=0.8, label='Population ages')

plt.xlabel('x')
plt.ylabel('y')
plt.title('Interesting Graph\nCheck it out')
plt.legend()
plt.show()

Example5:

import matplotlib.pyplot as plt

# Scatter plot of eight points drawn as black circles of size 25.
x = [1, 2, 3, 4, 5, 6, 7, 8]
y = [5, 2, 4, 2, 1, 4, 5, 2]

marker_style = {'color': 'k', 's': 25, 'marker': "o"}
plt.scatter(x, y, label='skitscat', **marker_style)

plt.xlabel('x')
plt.ylabel('y')
plt.title('Interesting Graph\nCheck it out')
plt.legend()
plt.show()

Example6:

import matplotlib.pyplot as plt

# Stacked area chart of four activity series over five days.
days = [1, 2, 3, 4, 5]

sleeping = [7, 8, 6, 11, 7]
eating = [2, 3, 4, 3, 2]
working = [7, 8, 7, 2, 2]
playing = [8, 5, 7, 8, 13]

# Series are stacked in the order given; colours are matched by position.
stacked_series = (sleeping, eating, working, playing)
plt.stackplot(days, *stacked_series, colors=['m', 'c', 'r', 'k'])

plt.xlabel('x')
plt.ylabel('y')
plt.title('Interesting Graph\nCheck it out')

plt.show()

Example7:

import matplotlib.pyplot as plt

# Stacked area chart of four activity series over five days, with a legend
# built from empty proxy lines.
days = [1, 2, 3, 4, 5]

sleeping = [7, 8, 6, 11, 7]
eating = [2, 3, 4, 3, 2]
working = [7, 8, 7, 2, 2]
playing = [8, 5, 7, 8, 13]

# One (colour, legend label) pair per stacked series, in stacking order.
legend_entries = (
    ('m', 'Sleeping'),
    ('c', 'Eating'),
    ('r', 'Working'),
    ('k', 'Playing'),
)

# Empty lines draw nothing on the axes; they exist only to carry a coloured,
# labelled handle into the legend.
for colour, activity in legend_entries:
    plt.plot([], [], color=colour, label=activity, linewidth=5)

plt.stackplot(days, sleeping, eating, working, playing,
              colors=[colour for colour, _ in legend_entries])

plt.xlabel('x')
plt.ylabel('y')
plt.title('Interesting Graph\nCheck it out')
plt.legend()
plt.show()

Example8:

import matplotlib.pyplot as plt

# Pie chart of four activities with percentage labels on each wedge.
slices = [7, 2, 2, 13]
activities = ['sleeping', 'eating', 'working', 'playing']
cols = ['c', 'm', 'r', 'b']

# Only the second wedge ("eating") is pulled out from the centre.
wedge_offsets = (0, 0.1, 0, 0)

plt.pie(
    slices,
    labels=activities,
    colors=cols,
    explode=wedge_offsets,
    shadow=True,
    startangle=90,
    autopct='%1.1f%%',  # one decimal place followed by a literal percent sign
)

plt.title('Interesting Graph\n Check it out')
plt.show()




No comments:

Post a Comment