Data Analysis:
Install the pandas module:
pip install pandas
After installing, import the pandas module:
import pandas
Help on pandas:
import pandas
help(pandas.DataFrame)
Example:
from pandas import DataFrame, read_csv
import pandas as pd
import matplotlib.pyplot as plt
# Build a five-row baby-name data set, round-trip it through a CSV file,
# and report the most popular name.
names = ["Bob", "Jessica", "Mary", "John", "Mel"]
births = [968, 155, 77, 578, 973]

# Pair each name with its birth count.
BabyDataSet = list(zip(names, births))
print(BabyDataSet)

df = pd.DataFrame(BabyDataSet, columns=["Names", "Births"])
print(df)

# Write the rows only: no index column and no header line.
df.to_csv("births1880.csv", index=False, header=False)

# With default arguments read_csv uses the first record as the header row.
df = pd.read_csv("births1880.csv")
print(df)

# header=None keeps every record as data; the columns are numbered instead.
df = pd.read_csv("births1880.csv", header=None)
print(df)

# names=... keeps every record as data and supplies the column labels.
df = pd.read_csv("births1880.csv", names=["Names", "Births"])
print(df)

# Highest birth count first, so the first row is the most popular name.
Sorted = df.sort_values(by="Births", ascending=False)
print(Sorted.head(1))
print(df["Births"].max())

for column in ("Names", "Births"):
    print(df[column])
Example2:
import pandas as pd
from numpy import random
import matplotlib.pyplot as plt
import os
import sys #only needed to determine Python version number
import matplotlib #only needed to determine Matplotlib version number
# Generate 1,000 random (name, births) records, round-trip them through a
# text file, and total the births per name.
names = ['Bob', 'Jessica', 'Mary', 'John', 'Mel']

# Fixed seed so that every run draws the same sample.
random.seed(500)
random_names = [names[random.randint(low=0, high=len(names))] for _ in range(1000)]
# Print first 10 records
print(random_names[:10])

births = [random.randint(low=0, high=1000) for _ in range(1000)]
print(births[:10])

BabyDataSet = list(zip(random_names, births))
print(BabyDataSet[:10])

df = pd.DataFrame(data=BabyDataSet, columns=['Names', 'Births'])
print(df[:10])

# Write the rows only: no index column and no header line.
df.to_csv('births1880.txt', index=False, header=False)

# With default arguments read_csv uses the first record as the header row.
df = pd.read_csv('births1880.txt')
# info() prints its own report and returns None, so it is not wrapped in
# print() (that would emit a stray "None" line).
df.info()

# header=None keeps every record as data.
df = pd.read_csv('births1880.txt', header=None)
df.info()
print(df.tail())

# names=... keeps every record as data and supplies the column labels.
df = pd.read_csv('births1880.txt', names=['Names', 'Births'])
print(df.head(5))

unique_names = df['Names'].unique()
print(unique_names)
for x in unique_names:
    print(x)
print(df['Names'].describe())

name = df.groupby('Names')
# Apply the sum function to the groupby object
df = name.sum()
print(df)

# Largest total first, so the first row is the most popular name.
Sorted = df.sort_values(['Births'], ascending=False)
print(Sorted.head(1))
Example3:
import pandas as pd
# Group a small frame by one key, then by two keys, summing each group.
d = {
    "one": [1] * 5,
    "two": [2] * 5,
    "letter": ["a", "a", "b", "b", "c"],
}
df = pd.DataFrame(d)
print(df)

# Sum the numeric columns within each letter.
one = df.groupby("letter")
print(one.sum())

# Group on the (letter, one) pair; only column 'two' is left to be summed.
letterone = df.groupby(["letter", "one"]).sum()
print(letterone)
Example4:
import pandas as pd
import sys
# Build two monthly revenue frames, stack them, and flag every value that
# lies more than 1.96 standard deviations from the mean.
States = ["NY", "NY", "NY", "NY", "FL", "FL", "GA", "GA", "FL", "FL"]

# First frame: ten month-start dates beginning January 2012.
data = [1.0, 2, 3, 4, 5, 6, 7, 8, 9, 10]
idx = pd.date_range("1/1/2012", periods=10, freq="MS")
df1 = pd.DataFrame(data, index=idx, columns=["Revenue"])
df1["State"] = States

# Second frame: ten month-start dates beginning January 2013.
data2 = [10.0, 10.0, 9, 9, 8, 8, 7, 7, 6, 6]
idx2 = pd.date_range("1/1/2013", periods=10, freq="MS")
df2 = pd.DataFrame(data2, index=idx2, columns=["Revenue"])
df2["State"] = States

# Stack the two frames into one 20-row frame.
df = pd.concat([df1, df2])
print(df)

# Work on a copy so that df keeps only its original columns.
newdf = df.copy()
revenue = newdf["Revenue"]
deviation = abs(revenue - revenue.mean())
threshold = 1.96 * revenue.std()

newdf["x-Mean"] = deviation
newdf["1.96*std"] = threshold
newdf["Outlier"] = deviation > threshold
print(newdf)
Data science/data reporting:
import matplotlib.pyplot as plt
# Draw a single line through three points and display the figure.
x_values = [1, 2, 3]
y_values = [5, 7, 4]
plt.plot(x_values, y_values)
plt.show()
Example2:
import matplotlib.pyplot as plt
# Plot two labelled lines on shared axes, then add axis labels, a title
# and a legend.
x, y = [1, 2, 3], [5, 7, 4]
x2, y2 = [1, 2, 3], [10, 14, 12]

for xs, ys, caption in ((x, y, 'First Line'), (x2, y2, 'Second Line')):
    plt.plot(xs, ys, label=caption)

plt.title('Interesting Graph\n Check it out')
plt.xlabel('Plot Number')
plt.ylabel('Important var')
plt.legend()
plt.show()
Example3:
import matplotlib.pyplot as plt
# Two interleaved bar series: the odd positions use the default colour,
# the even positions are drawn in green.
odd_x, odd_heights = [1, 3, 5, 7, 9], [5, 2, 7, 8, 2]
even_x, even_heights = [2, 4, 6, 8, 10], [8, 6, 2, 5, 6]

plt.bar(odd_x, odd_heights, label="Example one")
plt.bar(even_x, even_heights, label="Example two", color='g')

plt.title('Epic Graph\n Another Line! Whoa')
plt.xlabel('bar number')
plt.ylabel('bar height')
plt.legend()
plt.show()
Example6:
import matplotlib.pyplot as plt
# Stack the hours spent on four activities across five days.
days = [1, 2, 3, 4, 5]
sleeping = [7, 8, 6, 11, 7]
eating = [2, 3, 4, 3, 2]
working = [7, 8, 7, 2, 2]
playing = [8, 5, 7, 8, 13]

# One colour per activity, in the same order as the stacked series.
activity_hours = (sleeping, eating, working, playing)
plt.stackplot(days, *activity_hours, colors=['m', 'c', 'r', 'k'])

plt.title('Interesting Graph\nCheck it out')
plt.xlabel('x')
plt.ylabel('y')
plt.show()
Install the pandas module:
pip install pandas
After installing, import the pandas module:
import pandas
Help on pandas:
import pandas
help(pandas.DataFrame)
Example:
from pandas import DataFrame, read_csv
import pandas as pd
import matplotlib.pyplot as plt
# Build a five-row baby-name data set, round-trip it through a CSV file,
# and report the most popular name.
names = ["Bob", "Jessica", "Mary", "John", "Mel"]
births = [968, 155, 77, 578, 973]

# Pair each name with its birth count.
BabyDataSet = list(zip(names, births))
print(BabyDataSet)

df = pd.DataFrame(BabyDataSet, columns=["Names", "Births"])
print(df)

# Write the rows only: no index column and no header line.
df.to_csv("births1880.csv", index=False, header=False)

# With default arguments read_csv uses the first record as the header row.
df = pd.read_csv("births1880.csv")
print(df)

# header=None keeps every record as data; the columns are numbered instead.
df = pd.read_csv("births1880.csv", header=None)
print(df)

# names=... keeps every record as data and supplies the column labels.
df = pd.read_csv("births1880.csv", names=["Names", "Births"])
print(df)

# Highest birth count first, so the first row is the most popular name.
Sorted = df.sort_values(by="Births", ascending=False)
print(Sorted.head(1))
print(df["Births"].max())

for column in ("Names", "Births"):
    print(df[column])
Example2:
import pandas as pd
from numpy import random
import matplotlib.pyplot as plt
import os
import sys #only needed to determine Python version number
import matplotlib #only needed to determine Matplotlib version number
# Generate 1,000 random (name, births) records, round-trip them through a
# text file, and total the births per name.
names = ['Bob', 'Jessica', 'Mary', 'John', 'Mel']

# Fixed seed so that every run draws the same sample.
random.seed(500)
random_names = [names[random.randint(low=0, high=len(names))] for _ in range(1000)]
# Print first 10 records
print(random_names[:10])

births = [random.randint(low=0, high=1000) for _ in range(1000)]
print(births[:10])

BabyDataSet = list(zip(random_names, births))
print(BabyDataSet[:10])

df = pd.DataFrame(data=BabyDataSet, columns=['Names', 'Births'])
print(df[:10])

# Write the rows only: no index column and no header line.
df.to_csv('births1880.txt', index=False, header=False)

# With default arguments read_csv uses the first record as the header row.
df = pd.read_csv('births1880.txt')
# info() prints its own report and returns None, so it is not wrapped in
# print() (that would emit a stray "None" line).
df.info()

# header=None keeps every record as data.
df = pd.read_csv('births1880.txt', header=None)
df.info()
print(df.tail())

# names=... keeps every record as data and supplies the column labels.
df = pd.read_csv('births1880.txt', names=['Names', 'Births'])
print(df.head(5))

unique_names = df['Names'].unique()
print(unique_names)
for x in unique_names:
    print(x)
print(df['Names'].describe())

name = df.groupby('Names')
# Apply the sum function to the groupby object
df = name.sum()
print(df)

# Largest total first, so the first row is the most popular name.
Sorted = df.sort_values(['Births'], ascending=False)
print(Sorted.head(1))
Example3:
import pandas as pd
# Group a small frame by one key, then by two keys, summing each group.
d = {
    "one": [1] * 5,
    "two": [2] * 5,
    "letter": ["a", "a", "b", "b", "c"],
}
df = pd.DataFrame(d)
print(df)

# Sum the numeric columns within each letter.
one = df.groupby("letter")
print(one.sum())

# Group on the (letter, one) pair; only column 'two' is left to be summed.
letterone = df.groupby(["letter", "one"]).sum()
print(letterone)
Example4:
import pandas as pd
import sys
# Build two monthly revenue frames, stack them, and flag every value that
# lies more than 1.96 standard deviations from the mean.
States = ["NY", "NY", "NY", "NY", "FL", "FL", "GA", "GA", "FL", "FL"]

# First frame: ten month-start dates beginning January 2012.
data = [1.0, 2, 3, 4, 5, 6, 7, 8, 9, 10]
idx = pd.date_range("1/1/2012", periods=10, freq="MS")
df1 = pd.DataFrame(data, index=idx, columns=["Revenue"])
df1["State"] = States

# Second frame: ten month-start dates beginning January 2013.
data2 = [10.0, 10.0, 9, 9, 8, 8, 7, 7, 6, 6]
idx2 = pd.date_range("1/1/2013", periods=10, freq="MS")
df2 = pd.DataFrame(data2, index=idx2, columns=["Revenue"])
df2["State"] = States

# Stack the two frames into one 20-row frame.
df = pd.concat([df1, df2])
print(df)

# Work on a copy so that df keeps only its original columns.
newdf = df.copy()
revenue = newdf["Revenue"]
deviation = abs(revenue - revenue.mean())
threshold = 1.96 * revenue.std()

newdf["x-Mean"] = deviation
newdf["1.96*std"] = threshold
newdf["Outlier"] = deviation > threshold
print(newdf)
Data science/data reporting:
import matplotlib.pyplot as plt
# Draw a single line through three points and display the figure.
x_values = [1, 2, 3]
y_values = [5, 7, 4]
plt.plot(x_values, y_values)
plt.show()
Example2:
import matplotlib.pyplot as plt
# Plot two labelled lines on shared axes, then add axis labels, a title
# and a legend.
x, y = [1, 2, 3], [5, 7, 4]
x2, y2 = [1, 2, 3], [10, 14, 12]

for xs, ys, caption in ((x, y, 'First Line'), (x2, y2, 'Second Line')):
    plt.plot(xs, ys, label=caption)

plt.title('Interesting Graph\n Check it out')
plt.xlabel('Plot Number')
plt.ylabel('Important var')
plt.legend()
plt.show()
Example3:
import matplotlib.pyplot as plt
# Two interleaved bar series: the odd positions use the default colour,
# the even positions are drawn in green.
odd_x, odd_heights = [1, 3, 5, 7, 9], [5, 2, 7, 8, 2]
even_x, even_heights = [2, 4, 6, 8, 10], [8, 6, 2, 5, 6]

plt.bar(odd_x, odd_heights, label="Example one")
plt.bar(even_x, even_heights, label="Example two", color='g')

plt.title('Epic Graph\n Another Line! Whoa')
plt.xlabel('bar number')
plt.ylabel('bar height')
plt.legend()
plt.show()
Example4:
import matplotlib.pyplot as plt
# Histogram of a list of ages, counted in 10-year bins from 0 to 130.
population_ages = [22, 55, 62, 45, 21, 22, 34, 42, 42, 4, 99, 102, 110, 120,
                   121, 122, 130, 111, 115, 112, 80, 75, 65, 54, 44, 43, 42, 48]
bins = list(range(0, 131, 10))

# The histogram is given a label so that plt.legend() below has an entry to
# show; without one matplotlib warns that no labelled artists were found.
plt.hist(population_ages, bins, histtype='bar', rwidth=0.8,
         label='population ages')

plt.xlabel('x')
plt.ylabel('y')
plt.title('Interesting Graph\nCheck it out')
plt.legend()
plt.show()
Example5:
import matplotlib.pyplot as plt
# Scatter eight points as black circular markers with a legend entry.
x = [1, 2, 3, 4, 5, 6, 7, 8]
y = [5, 2, 4, 2, 1, 4, 5, 2]

marker_style = dict(label='skitscat', color='k', s=25, marker="o")
plt.scatter(x, y, **marker_style)

plt.title('Interesting Graph\nCheck it out')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()
Example6:
import matplotlib.pyplot as plt
# Stack the hours spent on four activities across five days.
days = [1, 2, 3, 4, 5]
sleeping = [7, 8, 6, 11, 7]
eating = [2, 3, 4, 3, 2]
working = [7, 8, 7, 2, 2]
playing = [8, 5, 7, 8, 13]

# One colour per activity, in the same order as the stacked series.
activity_hours = (sleeping, eating, working, playing)
plt.stackplot(days, *activity_hours, colors=['m', 'c', 'r', 'k'])

plt.title('Interesting Graph\nCheck it out')
plt.xlabel('x')
plt.ylabel('y')
plt.show()
Example7:
import matplotlib.pyplot as plt
# Stack the hours spent on four activities across five days, with a legend
# naming each coloured band.
days = [1, 2, 3, 4, 5]
sleeping = [7, 8, 6, 11, 7]
eating = [2, 3, 4, 3, 2]
working = [7, 8, 7, 2, 2]
playing = [8, 5, 7, 8, 13]

# stackplot takes the legend text through `labels`, so no empty proxy lines
# have to be plotted just to create legend entries. Labels and colours are
# listed in the same order as the stacked series.
plt.stackplot(
    days, sleeping, eating, working, playing,
    colors=['m', 'c', 'r', 'k'],
    labels=['Sleeping', 'Eating', 'Working', 'Playing'],
)

plt.xlabel('x')
plt.ylabel('y')
plt.title('Interesting Graph\nCheck it out')
plt.legend()
plt.show()
Example8:
import matplotlib.pyplot as plt
# Pie chart of the hours spent on four activities.
slices = [7, 2, 2, 13]
activities = ['sleeping', 'eating', 'working', 'playing']
cols = ['c', 'm', 'r', 'b']

pie_options = {
    'labels': activities,
    'colors': cols,
    'startangle': 90,
    'shadow': True,
    # Only the second wedge gets a non-zero offset from the centre.
    'explode': (0, 0.1, 0, 0),
    # Print each wedge's share as a percentage with one decimal place.
    'autopct': '%1.1f%%',
}
plt.pie(slices, **pie_options)

plt.title('Interesting Graph\n Check it out')
plt.show()
No comments:
Post a Comment