Demonstrate the following data preprocessing tasks using Python libraries.
a) Dealing with categorical data
b) Scaling the features
c) Splitting dataset into Training and Testing Sets
[1]: import pandas as pd
[2]: dataset=pd.read_csv("sample.csv")
[3]: dataset
[3]: Index nation purchase age salary
0 0 India No 25.0 35000
1 1 Russia Yes 27.0 40000
2 2 Germany No 50.0 60000
3 3 Russia No 35.0 40000
4 4 Germany Yes 40.0 50000
5 5 India Yes 35.0 40000
6 6 Russia No 39.1 20000
7 7 India Yes 40.0 40000
8 8 Germany No 50.0 30000
9 9 India Yes 37.0 40000
10 10 Germany No 21.0 70000
11 11 India Yes 39.1 80000
12 12 Russia No 63.0 40000
[4]: from sklearn.preprocessing import LabelBinarizer
[7]: label_binzr =LabelBinarizer()
[8]: label_binzr_output = label_binzr.fit_transform(dataset['nation'])
[9]: result = pd.DataFrame(label_binzr_output,columns=label_binzr.classes_)
[10]: print(result)
Germany India Russia
0 0 1 0
1 0 0 1
2 1 0 0
3 0 0 1
4 1 0 0
5 0 1 0
6 0 0 1
7 0 1 0
8 1 0 0
9 0 1 0
10 1 0 0
11 0 1 0
12 0 0 1
[11]: type(result)
[11]: pandas.core.frame.DataFrame
[14]: type(label_binzr_output)
[14]: numpy.ndarray
[15]: from sklearn.model_selection import train_test_split
[18]: X=dataset.iloc[:,[0,3,4]]
[19]: X
[19]: Index age salary
0 0 25.0 35000
1 1 27.0 40000
2 2 50.0 60000
3 3 35.0 40000
4 4 40.0 50000
5 5 35.0 40000
6 6 39.1 20000
7 7 40.0 40000
8 8 50.0 30000
9 9 37.0 40000
10 10 21.0 70000
11 11 39.1 80000
12 12 63.0 40000
[22]: Y=dataset.iloc[:,1].values
[23]: Y
[23]: array(['India', 'Russia', 'Germany', 'Russia', 'Germany', 'India',
'Russia', 'India', 'Germany', 'India', 'Germany', 'India',
'Russia'], dtype=object)
[33]: X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.5,random_state=0)
[34]: X_train
[34]: Index age salary
7 7 40.0 40000
9 9 37.0 40000
3 3 35.0 40000
0 0 25.0 35000
5 5 35.0 40000
12 12 63.0 40000
[35]: type(X_train)
[35]: pandas.core.frame.DataFrame
[36]: Y_train
[36]: array(['India', 'India', 'Russia', 'India', 'India', 'Russia'],
dtype=object)
[37]: type(Y_train)
[37]: numpy.ndarray
[38]: from sklearn.preprocessing import StandardScaler
[39]: sc_X = StandardScaler ()
[40]: X_train2 = sc_X.fit_transform(X_train)
[41]: X_train2
[41]: array([[ 0.25537696, 0.07177362, 0.4472136 ],
[ 0.76613088, -0.18661142, 0.4472136 ],
[-0.76613088, -0.35886811, 0.4472136 ],
[-1.53226176, -1.22015156, -2.23606798],
[-0.25537696, -0.35886811, 0.4472136 ],
[ 1.53226176, 2.05272557, 0.4472136 ]])