a) Loading the dataset
b) Identifying the dependent and independent variables
c) Dealing with missing data
import pandas as pd
[5]: dataset=pd.read_csv("Salary_Data.csv")
dataset
[6]: YearsExperience Salary
0 1.1 39343.0
1 1.3 46205.0
2 1.5 37731.0
3 2.0 43525.0
4 2.2 39891.0
5 2.9 NaN
6 3.0 60150.0
7 3.2 54445.0
8 3.2 64445.0
9 3.7 57189.0
10 3.9 63218.0
11 NaN 55794.0
12 4.0 56957.0
13 4.1 57081.0
14 4.5 61111.0
15 4.9 67938.0
16 5.1 66029.0
17 5.3 83088.0
18 5.9 81363.0
19 6.0 93940.0
20 6.8 91738.0
21 7.1 NaN
22 7.9 101302.0
23 8.2 113812.0
24 8.7 109431.0
25 9.0 NaN
26 9.5 116969.0
27 9.6 112635.0
28 NaN 122391.0
29 10.5 121872.0
type(dataset )
pandas.core.frame.DataFrame
dataset.shape
(30, 2)
[7]: X=dataset.iloc[:,:-1].values
type(X)
numpy.ndarray
[8]: X
[8]: array([[ 1.1],
[ 1.3],
[ 1.5],
[ 2. ],
[ 2.2],
[ 2.9],
[ 3. ],
[ 3.2],
[ 3.2],
[ 3.7],
[ 3.9],
[ nan],
[ 4. ],
[ 4.1],
[ 4.5],
[ 4.9],
[ 5.1],
[ 5.3],
[ 5.9],
[ 6. ],
[ 6.8],
[ 7.1],
[ 7.9],
[ 8.2],
[ 8.7],
[ 9. ],
[ 9.5],
[ 9.6],
[ nan],
[10.5]])
[9]: Y=dataset.iloc[:,-1]
[10]: Y
[10]: 0 39343.0
1 46205.0
2 37731.0
3 43525.0
4 39891.0
5 NaN
6 60150.0
7 54445.0
8 64445.0
9 57189.0
10 63218.0
11 55794.0
12 56957.0
13 57081.0
14 61111.0
15 67938.0
16 66029.0
17 83088.0
18 81363.0
19 93940.0
20 91738.0
21 NaN
22 101302.0
23 113812.0
24 109431.0
25 NaN
26 116969.0
27 112635.0
28 122391.0
29 121872.0
Name: Salary, dtype: float64
[11]: Y=dataset.iloc[:,-1].values
[12]: Y
[12]: array([ 39343., 46205., 37731., 43525., 39891., nan, 60150.,
54445., 64445., 57189., 63218., 55794., 56957., 57081.,
61111., 67938., 66029., 83088., 81363., 93940., 91738.,
nan, 101302., 113812., 109431., nan, 116969., 112635.,
122391., 121872.])
[13]: import numpy as np
[22]: from sklearn.impute import SimpleImputer
[23]: type(X)
[23]: numpy.ndarray
[32]: X.shape
[32]: (30, 1)
[36]: imp=SimpleImputer(missing_values=np.nan,strategy="mean")
[38]: X1=imp.fit_transform(X)
[39]: Y.shape
[39]: (30,)
[40]: Y=Y.reshape((-1,1))
[41]: Y.shape
[41]: (30, 1)
[42]: Y1=imp.fit_transform(Y)
[43]: Y=Y.reshape((-1))  # note: Y itself is not imputed; the mean-filled values are in Y1, so Y below still shows nan
Y
array([ 39343., 46205., 37731., 43525., 39891., nan, 60150.,
54445., 64445., 57189., 63218., 55794., 56957., 57081.,
61111., 67938., 66029., 83088., 81363., 93940., 91738.,
nan, 101302., 113812., 109431., nan, 116969., 112635.,
122391., 121872.])