Mar-12-2021, 12:58 PM
I am currently using the 'train.csv' file found here: https://www.kaggle.com/c/house-prices-ad...=train.csv
after scatterplotting the features 'YearBuilt' vs 'SalePrice', I am removing the csv rows containing outliers showing in thegraph. My code removes these outliers the first time I use the 'drop' command, but after looking at the graph without the firstly detected outliers more outliers appear, but once I try to remove them, they still appear on the scatterplot, even though I can definitely see that the rows are indeed being erased. I cannot understand why it does that and what should I do to fix it.
after scatterplotting the features 'YearBuilt' vs 'SalePrice', I am removing the csv rows containing outliers showing in thegraph. My code removes these outliers the first time I use the 'drop' command, but after looking at the graph without the firstly detected outliers more outliers appear, but once I try to remove them, they still appear on the scatterplot, even though I can definitely see that the rows are indeed being erased. I cannot understand why it does that and what should I do to fix it.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame
from scipy.stats import pearsonr
train_data=pd.read_csv(r'train.csv')
corr_YearBuilt0= pearsonr(train_data['YearBuilt'], train_data['SalePrice'])
print(corr_YearBuilt0, 'corr_YearBuilt0')
sns.scatterplot(x=train_data['YearBuilt'],y= train_data['SalePrice'])
m, b = np.polyfit(train_data['YearBuilt'], train_data['SalePrice'], 1)
sns.regplot(x=train_data['YearBuilt'], y=train_data['SalePrice'])
plt.show()
print(train_data.shape,'+++++++++shape')
outliers_year_built= train_data['SalePrice'].between(500000, 800000, inclusive=False) &
train_data['YearBuilt'].between(1980, 2020, inclusive=False)
"""PRINT AND DROP ROWS CONTAINING OUTLIERS"""
if outliers_year_built.any():
print(train_data[outliers_year_built],'outliars')
print (train_data[outliers_year_built].index)
print(train_data[outliers_year_built].index.values.tolist(),'----------location outliers------')
train_data.drop(train_data.index[train_data[outliers_year_built].index.values.tolist()],
inplace=True)
else:
print('no outliers')
sns.scatterplot(x=train_data['YearBuilt'],y= train_data['SalePrice'])
m, b = np.polyfit(train_data['YearBuilt'], train_data['SalePrice'], 1)
sns.regplot(x=train_data['YearBuilt'], y=train_data['SalePrice'])
plt.show()
corr_YearBuilt= pearsonr(train_data['YearBuilt'], train_data['SalePrice'])
print(corr_YearBuilt, 'corr_YearBuilt')
print(train_data.shape,'+++++++++shape')
"""DROP ROWS CONTAINING ADDITIONAL OUTLIERS"""
outliers_year_built_2= train_data['SalePrice'].between(200000, 500000, inclusive=False) &
train_data['YearBuilt'].between(1860, 1920, inclusive=False)
if outliers_year_built_2.any():
train_data.drop(train_data.index[train_data[outliers_year_built_2].index.values.tolist()],
inplace=True) #THIS ROW DOES NOT SEEM TO WORK AS THE OUTLIERS KEEP APPEARING IN THE FOLLOWING
#SCATTERPLOT
else:
print('no outliers')
sns.scatterplot(x=train_data['YearBuilt'],y= train_data['SalePrice'])
m, b = np.polyfit(train_data['YearBuilt'], train_data['SalePrice'], 1)
sns.regplot(x=train_data['YearBuilt'], y=train_data['SalePrice'])
plt.show() #THE OUTLIERS I TRIED TO REMOVE RIGHT ABOVE ARE STILL AROUND
corr_YearBuilt_2= pearsonr(train_data['YearBuilt'], train_data['SalePrice'])
print(corr_YearBuilt_2, 'corr_YearBuilt')
print(train_data.shape,'+++++++++shape')
