library(tidyverse)
library(tidymodels)
library(conflicted)
conflicts_prefer(dplyr::filter())
conflicts_prefer(dplyr::lag())
conflicts_prefer(recipes::step())
House Prices
Advanced Regression Techniques
Predict sales prices and practice feature engineering, RFs, and gradient boosting.
Description
Ask a home buyer to describe their dream house, and they probably won’t begin with the height of the basement ceiling or the proximity to an east-west railroad. But this dataset proves that much more influences price negotiations than the number of bedrooms or a white-picket fence.
Challenge
With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, predict the final price of each home.
Set Environment
Read Data
# Read the training and test CSV files from the working directory.
house_prices_train <- read.csv("./house_prices_train.csv")
house_prices_test <- read.csv("./house_prices_test.csv")

# Stack both tables into one; the new `From` column holds "train" or "test"
# so the two parts can be separated again later.
house_prices_data <- list(
  train = house_prices_train,
  test = house_prices_test
) |>
  bind_rows(.id = "From")
glimpse(house_prices_data)
Rows: 2,919
Columns: 82
$ From <chr> "train", "train", "train", "train", "train", "train", "t…
$ Id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
$ MSSubClass <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,…
$ MSZoning <chr> "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RM", "R…
$ LotFrontage <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, …
$ LotArea <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612…
$ Street <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", …
$ Alley <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ LotShape <chr> "Reg", "Reg", "IR1", "IR1", "IR1", "IR1", "Reg", "IR1", …
$ LandContour <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", …
$ Utilities <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "AllPu…
$ LotConfig <chr> "Inside", "FR2", "Inside", "Corner", "FR2", "Inside", "I…
$ LandSlope <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", …
$ Neighborhood <chr> "CollgCr", "Veenker", "CollgCr", "Crawfor", "NoRidge", "…
$ Condition1 <chr> "Norm", "Feedr", "Norm", "Norm", "Norm", "Norm", "Norm",…
$ Condition2 <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", …
$ BldgType <chr> "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", …
$ HouseStyle <chr> "2Story", "1Story", "2Story", "2Story", "2Story", "1.5Fi…
$ OverallQual <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,…
$ OverallCond <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,…
$ YearBuilt <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 19…
$ YearRemodAdd <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 19…
$ RoofStyle <chr> "Gable", "Gable", "Gable", "Gable", "Gable", "Gable", "G…
$ RoofMatl <chr> "CompShg", "CompShg", "CompShg", "CompShg", "CompShg", "…
$ Exterior1st <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Sdng", "VinylSd", "…
$ Exterior2nd <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Shng", "VinylSd", "…
$ MasVnrType <chr> "BrkFace", "None", "BrkFace", "None", "BrkFace", "None",…
$ MasVnrArea <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, …
$ ExterQual <chr> "Gd", "TA", "Gd", "TA", "Gd", "TA", "Gd", "TA", "TA", "T…
$ ExterCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T…
$ Foundation <chr> "PConc", "CBlock", "PConc", "BrkTil", "PConc", "Wood", "…
$ BsmtQual <chr> "Gd", "Gd", "Gd", "TA", "Gd", "Gd", "Ex", "Gd", "TA", "T…
$ BsmtCond <chr> "TA", "TA", "TA", "Gd", "TA", "TA", "TA", "TA", "TA", "T…
$ BsmtExposure <chr> "No", "Gd", "Mn", "No", "Av", "No", "Av", "Mn", "No", "N…
$ BsmtFinType1 <chr> "GLQ", "ALQ", "GLQ", "ALQ", "GLQ", "GLQ", "GLQ", "ALQ", …
$ BsmtFinSF1 <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99…
$ BsmtFinType2 <chr> "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "BLQ", …
$ BsmtFinSF2 <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ BsmtUnfSF <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17…
$ TotalBsmtSF <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10…
$ Heating <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", …
$ HeatingQC <chr> "Ex", "Ex", "Ex", "Gd", "Ex", "Ex", "Ex", "Ex", "Gd", "E…
$ CentralAir <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "…
$ Electrical <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "S…
$ X1stFlrSF <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, …
$ X2ndFlrSF <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,…
$ LowQualFinSF <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ GrLivArea <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10…
$ BsmtFullBath <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,…
$ BsmtHalfBath <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ FullBath <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,…
$ HalfBath <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,…
$ BedroomAbvGr <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,…
$ KitchenAbvGr <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
$ KitchenQual <chr> "Gd", "TA", "Gd", "Gd", "Gd", "TA", "Gd", "TA", "TA", "T…
$ TotRmsAbvGrd <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6…
$ Functional <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", …
$ Fireplaces <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,…
$ FireplaceQu <chr> NA, "TA", "TA", "Gd", "TA", NA, "Gd", "TA", "TA", "TA", …
$ GarageType <chr> "Attchd", "Attchd", "Attchd", "Detchd", "Attchd", "Attch…
$ GarageYrBlt <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 19…
$ GarageFinish <chr> "RFn", "RFn", "RFn", "Unf", "RFn", "Unf", "RFn", "RFn", …
$ GarageCars <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,…
$ GarageArea <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7…
$ GarageQual <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "Fa", "G…
$ GarageCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T…
$ PavedDrive <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "…
$ WoodDeckSF <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160…
$ OpenPorchSF <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,…
$ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, …
$ X3SsnPorch <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ ScreenPorch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, …
$ PoolArea <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ PoolQC <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Fence <chr> NA, NA, NA, NA, NA, "MnPrv", NA, NA, NA, NA, NA, NA, NA,…
$ MiscFeature <chr> NA, NA, NA, NA, NA, "Shed", NA, "Shed", NA, NA, NA, NA, …
$ MiscVal <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,…
$ MoSold <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10…
$ YrSold <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20…
$ SaleType <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "W…
$ SaleCondition <chr> "Normal", "Normal", "Normal", "Abnorml", "Normal", "Norm…
$ SalePrice <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000, …
Count NA
# Count the missing values in every column except SalePrice, reshape the
# one-row summary into (column name, NA count) pairs, and keep only the
# columns that contain at least one NA.
house_prices_data |>
  select(!SalePrice) |>
  summarise(across(everything(), \(col) sum(is.na(col)))) |>
  pivot_longer(
    cols = everything(),
    names_to = "columns",
    values_to = "na_count"
  ) |>
  filter(na_count > 0)
# A tibble: 34 × 2
columns na_count
<chr> <int>
1 MSZoning 4
2 LotFrontage 486
3 Alley 2721
4 Utilities 2
5 Exterior1st 1
6 Exterior2nd 1
7 MasVnrType 24
8 MasVnrArea 23
9 BsmtQual 81
10 BsmtCond 82
# ℹ 24 more rows
Convert Data
# Lookup tables that translate ordinal text grades into numeric scores
# (higher score = better grade).
quality_scores <- c(Po = 1, Fa = 2, TA = 3, Gd = 4, Ex = 5)
exposure_scores <- c(No = 1, Mn = 2, Av = 3, Gd = 4)
finish_scores <- c(Unf = 1, LwQ = 2, Rec = 3, BLQ = 4, ALQ = 5, GLQ = 6)
pool_scores <- c(Fa = 1, TA = 2, Gd = 3, Ex = 4)
fence_scores <- c(MnWw = 1, GdWo = 2, MnPrv = 3, GdPrv = 4)
yes_no_scores <- c(N = 0, Y = 1)

# Convert text grades to numeric scores.
#   grade    - vector of grade labels.
#   scores   - named numeric vector mapping label -> score.
#   na_score - score for a missing grade; the default keeps it NA.
# Returns a double vector as long as `grade`. Non-missing labels that are not
# in `scores` become NA.
score_grade <- function(grade, scores, na_score = NA_real_) {
  grade <- as.character(grade)
  if_else(is.na(grade), na_score, unname(scores[grade]))
}

# Bin ages (in years) into decade labels "0", "1-10", "11-20", ... up to `top`.
#   age      - numeric vector of ages; values <= 0 fall into "0".
#   top      - upper bound of the last decade bin (a multiple of 10).
#   overflow - label for ages above `top`; the default leaves them NA.
# Every bin includes its upper bound, so the label matches its contents
# (age 10 is "1-10", age 20 is "11-20", ...). Returns a character vector.
bin_age <- function(age, top, overflow = NA_character_) {
  uppers <- seq(10, top, by = 10)
  binned <- cut(
    age,
    breaks = c(-Inf, 0, uppers),
    labels = c("0", paste0(uppers - 9, "-", uppers)),
    right = TRUE
  )
  if_else(age > top, overflow, as.character(binned))
}

# Recode the combined train/test table: fill structural NAs, turn year columns
# into ages relative to the sale year, score ordinal grades numerically and
# convert the remaining text columns to factors.
house_prices_data <- house_prices_data |>
  mutate(
    MSSubClass = as.factor(MSSubClass),
    Alley = if_else(is.na(Alley), "None", Alley),
    # Age of the house at sale time; negative ages are clamped to 0.
    YearBuilt = pmax(YrSold - YearBuilt, 0),
    # Years since the last remodel, binned by decade (ages above 60 stay NA).
    YearRemodAdd = as.factor(bin_age(YrSold - YearRemodAdd, top = 60)),
    # A second exterior material equal to the first carries no information.
    Exterior2nd = if_else(
      Exterior2nd == Exterior1st, "None", Exterior2nd
    ),
    # Must run before MasVnrArea is filled: it tests the original NA pattern.
    MasVnrType = if_else(
      is.na(MasVnrType) & is.na(MasVnrArea), "None", MasVnrType
    ),
    MasVnrArea = if_else(is.na(MasVnrArea), 0, MasVnrArea),
    # Five-step quality grades; a missing grade stays NA.
    across(
      c(ExterQual, ExterCond, HeatingQC, KitchenQual),
      \(grade) score_grade(grade, quality_scores)
    ),
    # Five-step quality grades where a missing grade is scored 0.
    across(
      c(BsmtQual, BsmtCond, FireplaceQu, GarageQual, GarageCond),
      \(grade) score_grade(grade, quality_scores, na_score = 0)
    ),
    BsmtExposure = score_grade(BsmtExposure, exposure_scores, na_score = 0),
    across(
      c(BsmtFinType1, BsmtFinType2),
      \(grade) score_grade(grade, finish_scores, na_score = 0)
    ),
    across(
      c(BsmtFinSF1, BsmtFinSF2:TotalBsmtSF),
      \(area) if_else(is.na(area), 0, area)
    ),
    CentralAir = score_grade(CentralAir, yes_no_scores),
    Functional = if_else(is.na(Functional), "Typ", Functional),
    PoolQC = score_grade(PoolQC, pool_scores, na_score = 0),
    Fence = score_grade(Fence, fence_scores, na_score = 0),
    # Garage age at sale time, binned by decade; ages above 100 are "100+".
    GarageYrBlt = bin_age(YrSold - GarageYrBlt, top = 100, overflow = "100+"),
    # Rows without a garage type get "None" in all three garage columns.
    # GarageType itself is filled last so the NA test above it always sees
    # the original values.
    GarageYrBlt = if_else(is.na(GarageType), "None", GarageYrBlt),
    GarageFinish = if_else(is.na(GarageType), "None", GarageFinish),
    GarageType = if_else(is.na(GarageType), "None", GarageType),
    GarageYrBlt = as.factor(GarageYrBlt) |>
      fct_relevel("100+", "None", after = Inf),
    # A feature worth nothing is treated as absent; any NA left after that is
    # filled with "Gar2".
    # NOTE(review): the choice of "Gar2" is not explained here - confirm.
    MiscFeature = if_else(MiscVal == 0, "None", MiscFeature),
    MiscFeature = if_else(is.na(MiscFeature), "Gar2", MiscFeature),
    # Sale month/year become factors only now, after YrSold has been used in
    # the age arithmetic above.
    across(c(MoSold, YrSold), as.factor),
    # Remaining text columns become factors ordered by descending frequency.
    across(
      where(is.character),
      \(text) fct_infreq(as.factor(text))
    ),
    GarageType = fct_relevel(GarageType, "None", after = Inf)
  )
glimpse(house_prices_data)
Rows: 2,919
Columns: 82
$ From <fct> train, train, train, train, train, train, train, train, …
$ Id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
$ MSSubClass <fct> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,…
$ MSZoning <fct> RL, RL, RL, RL, RL, RL, RL, RL, RM, RL, RL, RL, RL, RL, …
$ LotFrontage <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, …
$ LotArea <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612…
$ Street <fct> Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pa…
$ Alley <fct> None, None, None, None, None, None, None, None, None, No…
$ LotShape <fct> Reg, Reg, IR1, IR1, IR1, IR1, Reg, IR1, Reg, Reg, Reg, I…
$ LandContour <fct> Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, L…
$ Utilities <fct> AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, …
$ LotConfig <fct> Inside, FR2, Inside, Corner, FR2, Inside, Inside, Corner…
$ LandSlope <fct> Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, G…
$ Neighborhood <fct> CollgCr, Veenker, CollgCr, Crawfor, NoRidge, Mitchel, So…
$ Condition1 <fct> Norm, Feedr, Norm, Norm, Norm, Norm, Norm, PosN, Artery,…
$ Condition2 <fct> Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Ar…
$ BldgType <fct> 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 2f…
$ HouseStyle <fct> 2Story, 1Story, 2Story, 2Story, 2Story, 1.5Fin, 1Story, …
$ OverallQual <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,…
$ OverallCond <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,…
$ YearBuilt <dbl> 5, 31, 7, 91, 8, 16, 3, 36, 77, 69, 43, 1, 46, 1, 48, 78…
$ YearRemodAdd <fct> 1-10, 31-40, 1-10, 31-40, 1-10, 11-20, 1-10, 31-40, 51-6…
$ RoofStyle <fct> Gable, Gable, Gable, Gable, Gable, Gable, Gable, Gable, …
$ RoofMatl <fct> CompShg, CompShg, CompShg, CompShg, CompShg, CompShg, Co…
$ Exterior1st <fct> VinylSd, MetalSd, VinylSd, Wd Sdng, VinylSd, VinylSd, Vi…
$ Exterior2nd <fct> None, None, None, Wd Shng, None, None, None, None, Wd Sh…
$ MasVnrType <fct> BrkFace, None, BrkFace, None, BrkFace, None, Stone, Ston…
$ MasVnrArea <dbl> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, …
$ ExterQual <dbl> 4, 3, 4, 3, 4, 3, 4, 3, 3, 3, 3, 5, 3, 4, 3, 3, 3, 3, 3,…
$ ExterCond <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ Foundation <fct> PConc, CBlock, PConc, BrkTil, PConc, Wood, PConc, CBlock…
$ BsmtQual <dbl> 4, 4, 4, 3, 4, 4, 5, 4, 3, 3, 3, 5, 3, 4, 3, 3, 3, 0, 3,…
$ BsmtCond <dbl> 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3,…
$ BsmtExposure <dbl> 1, 4, 2, 1, 3, 1, 3, 2, 1, 1, 1, 1, 1, 3, 1, 1, 1, 0, 1,…
$ BsmtFinType1 <dbl> 6, 5, 6, 5, 6, 6, 6, 5, 1, 6, 3, 6, 5, 1, 4, 1, 5, 0, 6,…
$ BsmtFinSF1 <dbl> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99…
$ BsmtFinType2 <dbl> 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,…
$ BsmtFinSF2 <dbl> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ BsmtUnfSF <dbl> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17…
$ TotalBsmtSF <dbl> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10…
$ Heating <fct> GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, Ga…
$ HeatingQC <dbl> 5, 5, 5, 4, 5, 5, 5, 5, 4, 5, 5, 5, 3, 5, 3, 5, 5, 3, 5,…
$ CentralAir <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ Electrical <fct> SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, …
$ X1stFlrSF <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, …
$ X2ndFlrSF <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,…
$ LowQualFinSF <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ GrLivArea <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10…
$ BsmtFullBath <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,…
$ BsmtHalfBath <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ FullBath <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,…
$ HalfBath <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,…
$ BedroomAbvGr <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,…
$ KitchenAbvGr <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
$ KitchenQual <dbl> 4, 3, 4, 4, 4, 3, 4, 3, 3, 3, 3, 5, 3, 4, 3, 3, 3, 3, 4,…
$ TotRmsAbvGrd <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6…
$ Functional <fct> Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ, Min1, Typ, Typ, …
$ Fireplaces <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,…
$ FireplaceQu <dbl> 0, 3, 3, 4, 3, 0, 4, 3, 3, 3, 0, 4, 0, 4, 2, 0, 3, 0, 0,…
$ GarageType <fct> Attchd, Attchd, Attchd, Detchd, Attchd, Attchd, Attchd, …
$ GarageYrBlt <fct> 1-10, 31-40, 1-10, 1-10, 1-10, 11-20, 1-10, 31-40, 71-80…
$ GarageFinish <fct> RFn, RFn, RFn, Unf, RFn, Unf, RFn, RFn, Unf, RFn, Unf, F…
$ GarageCars <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,…
$ GarageArea <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7…
$ GarageQual <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 2, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ GarageCond <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ PavedDrive <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,…
$ WoodDeckSF <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160…
$ OpenPorchSF <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,…
$ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, …
$ X3SsnPorch <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ ScreenPorch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, …
$ PoolArea <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ PoolQC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ Fence <dbl> 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0,…
$ MiscFeature <fct> None, None, None, None, None, Shed, None, Shed, None, No…
$ MiscVal <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,…
$ MoSold <fct> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10…
$ YrSold <fct> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20…
$ SaleType <fct> WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, New, WD, New…
$ SaleCondition <fct> Normal, Normal, Normal, Abnorml, Normal, Normal, Normal,…
$ SalePrice <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000, …
Draw Plot
# Split the combined table back into its two parts. The bookkeeping columns
# `From` and `Id` are removed from both; the test part also drops SalePrice.
train_data <- house_prices_data |>
  filter(From == "train") |>
  select(!c(From, Id))
test_data <- house_prices_data |>
  filter(From == "test") |>
  select(!c(From, Id, SalePrice))

# Ids of the test rows, kept aside as a one-column data frame.
test_id <- house_prices_data |>
  filter(From == "test") |>
  select(Id)

# Names of the predictor columns (every column of the test part).
columns <- colnames(test_data)

# Numeric predictors that take a small set of discrete steps; the plotting and
# testing loops handle these like categorical variables.
sequences <- c(
  "OverallQual", "OverallCond", "ExterQual", "ExterCond",
  "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
  "HeatingQC", "CentralAir",
  "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath",
  "BedroomAbvGr", "KitchenAbvGr", "KitchenQual", "TotRmsAbvGrd",
  "Fireplaces", "FireplaceQu",
  "GarageCars", "GarageQual", "GarageCond",
  "PoolQC", "Fence"
)
# Shared y-axis for every SalePrice panel: ticks every 100,000 up to 800,000,
# labelled with thousands separators.
price_axis <- scale_y_continuous(
  breaks = seq(0, 800000, by = 100000),
  labels = comma
)

# Draw two plots per predictor:
#   * factors and the discrete-step columns in `sequences`: a bar chart of
#     level counts and a box plot of SalePrice per level;
#   * other numeric columns: a histogram and a scatter plot against SalePrice.
# Columns of any other type are skipped.
for (x in columns) {
  values <- train_data[[x]]
  if (is.factor(values) || is.element(x, sequences)) {
    barPlot <- train_data |>
      ggplot(aes(x = as.factor(.data[[x]]))) +
      geom_bar(
        aes(y = after_stat(count)),
        color = "royalblue", fill = "skyblue"
      ) +
      xlab(x) +
      theme_bw()
    print(barPlot)
    boxPlot <- train_data |>
      ggplot(aes(y = SalePrice)) +
      geom_boxplot(
        aes(x = as.factor(.data[[x]])),
        color = "royalblue", fill = "skyblue"
      ) +
      price_axis +
      xlab(x) +
      theme_bw()
    print(boxPlot)
  } else if (is.numeric(values)) {
    # Bin width is derived from the spread of the non-zero, non-missing
    # values: one fifth of their interquartile range, rounded, plus 1 so the
    # width is never zero.
    nonzero <- values[!is.na(values) & values != 0]
    width <- round(IQR(nonzero, na.rm = TRUE) / 5, 0) + 1
    histogramPlot <- train_data |>
      ggplot(aes(x = .data[[x]])) +
      geom_histogram(
        na.rm = TRUE, binwidth = width, center = width / 2,
        color = "royalblue", fill = "skyblue"
      ) +
      theme_bw()
    print(histogramPlot)
    pointPlot <- train_data |>
      ggplot(aes(y = SalePrice)) +
      geom_point(
        aes(x = .data[[x]]), na.rm = TRUE, color = "royalblue"
      ) +
      price_axis +
      theme_bw()
    print(pointPlot)
  }
}





























































































































































# Histogram of the raw sale price in 10,000-wide bins.
ggplot(train_data, aes(x = SalePrice)) +
  geom_histogram(
    binwidth = 10000, center = 5000, na.rm = TRUE,
    fill = "skyblue", color = "royalblue"
  ) +
  scale_x_continuous(
    labels = comma,
    breaks = seq(0, 800000, by = 100000)
  ) +
  theme_bw()

# Normal Q-Q plot of the raw sale price.
ggplot(train_data, aes(sample = SalePrice)) +
  geom_qq(color = "slateblue") +
  geom_qq_line(linewidth = 1, color = "royalblue") +
  scale_y_continuous(
    labels = comma,
    breaks = seq(0, 800000, by = 100000)
  ) +
  theme_bw()

# Replace SalePrice with its base-10 logarithm for everything that follows.
train_data$SalePrice <- log10(train_data$SalePrice)

# Normal Q-Q plot of the log-transformed sale price.
ggplot(train_data, aes(sample = SalePrice)) +
  geom_qq(color = "slateblue") +
  geom_qq_line(linewidth = 1, color = "royalblue") +
  theme_bw()
Test Correlation
# Test the association between each predictor and SalePrice:
#   * factor columns and the discrete-step columns in `sequences` use a
#     Kruskal-Wallis rank sum test (the latter after conversion to factor);
#   * other numeric columns use Kendall's rank correlation.
# Columns of any other type are skipped without output. The argument
# expressions of the test calls are kept as written because they appear
# verbatim in the printed "data:" line of each result.
for (x in columns) {
  if (
    !is.factor(train_data[[x]]) &&
      !is.element(x, sequences) &&
      !is.numeric(train_data[[x]])
  ) {
    next
  }
  cat("x=", x, "\n")
  if (is.factor(train_data[[x]])) {
    print(kruskal.test(train_data$SalePrice, train_data[[x]]))
  } else if (is.element(x, sequences)) {
    print(kruskal.test(train_data$SalePrice, as.factor(train_data[[x]])))
  } else {
    print(
      cor.test(
        train_data$SalePrice, train_data[[x]],
        method = "kendall"
      )
    )
  }
}
x= MSSubClass
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 531.54, df = 14, p-value < 2.2e-16
x= MSZoning
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 270.07, df = 4, p-value < 2.2e-16
x= LotFrontage
Kendall's rank correlation tau
data: train_data$SalePrice and train_data[[x]]
z = 14.823, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
tau
0.2903612
x= LotArea
Kendall's rank correlation tau
data: train_data$SalePrice and train_data[[x]]
z = 17.95, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
tau
0.3141744
x= Street
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 3.0624, df = 1, p-value = 0.08013
x= Alley
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 45.714, df = 2, p-value = 1.184e-10
x= LotShape
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 150.95, df = 3, p-value < 2.2e-16
x= LandContour
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 35.31, df = 3, p-value = 1.048e-07
x= Utilities
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 0.40737, df = 1, p-value = 0.5233
x= LotConfig
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 35.724, df = 4, p-value = 3.298e-07
x= LandSlope
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 3.9388, df = 2, p-value = 0.1395
x= Neighborhood
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 868.53, df = 24, p-value < 2.2e-16
x= Condition1
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 74.404, df = 8, p-value = 6.493e-13
x= Condition2
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 17.774, df = 7, p-value = 0.01303
x= BldgType
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 69.801, df = 4, p-value = 2.501e-14
x= HouseStyle
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 185.43, df = 7, p-value < 2.2e-16
x= OverallQual
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 966.1, df = 9, p-value < 2.2e-16
x= OverallCond
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 238.77, df = 8, p-value < 2.2e-16
x= YearBuilt
Kendall's rank correlation tau
data: train_data$SalePrice and train_data[[x]]
z = -26.57, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
tau
-0.4685664
x= YearRemodAdd
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 519.19, df = 6, p-value < 2.2e-16
x= RoofStyle
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 39.878, df = 5, p-value = 1.581e-07
x= RoofMatl
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 18.872, df = 7, p-value = 0.008597
x= Exterior1st
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 304.04, df = 14, p-value < 2.2e-16
x= Exterior2nd
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 35.231, df = 15, p-value = 0.002279
x= MasVnrType
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 270.81, df = 3, p-value < 2.2e-16
x= MasVnrArea
Kendall's rank correlation tau
data: train_data$SalePrice and train_data[[x]]
z = 16.404, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
tau
0.3172199
x= ExterQual
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 683.44, df = 3, p-value < 2.2e-16
x= ExterCond
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 46.617, df = 4, p-value = 1.832e-09
x= Foundation
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 488.72, df = 5, p-value < 2.2e-16
x= BsmtQual
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 674.41, df = 4, p-value < 2.2e-16
x= BsmtCond
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 120.88, df = 4, p-value < 2.2e-16
x= BsmtExposure
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 191.03, df = 4, p-value < 2.2e-16
x= BsmtFinType1
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 348.88, df = 6, p-value < 2.2e-16
x= BsmtFinSF1
Kendall's rank correlation tau
data: train_data$SalePrice and train_data[[x]]
z = 12.164, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
tau
0.2208792
x= BsmtFinType2
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 64.589, df = 6, p-value = 5.232e-12
x= BsmtFinSF2
Kendall's rank correlation tau
data: train_data$SalePrice and train_data[[x]]
z = -1.476, p-value = 0.14
alternative hypothesis: true tau is not equal to 0
sample estimates:
tau
-0.03071045
x= BsmtUnfSF
Kendall's rank correlation tau
data: train_data$SalePrice and train_data[[x]]
z = 7.2624, p-value = 3.802e-13
alternative hypothesis: true tau is not equal to 0
sample estimates:
tau
0.1274574
x= TotalBsmtSF
Kendall's rank correlation tau
data: train_data$SalePrice and train_data[[x]]
z = 24.837, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
tau
0.4350183
x= Heating
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 33.267, df = 5, p-value = 3.331e-06
x= HeatingQC
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 356.78, df = 4, p-value < 2.2e-16
x= CentralAir
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 143.2, df = 1, p-value < 2.2e-16
x= Electrical
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 131.42, df = 4, p-value < 2.2e-16
x= X1stFlrSF
Kendall's rank correlation tau
data: train_data$SalePrice and train_data[[x]]
z = 23.511, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
tau
0.4115564
x= X2ndFlrSF
Kendall's rank correlation tau
data: train_data$SalePrice and train_data[[x]]
z = 12.105, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
tau
0.2324971
x= LowQualFinSF
Kendall's rank correlation tau
data: train_data$SalePrice and train_data[[x]]
z = -2.5943, p-value = 0.00948
alternative hypothesis: true tau is not equal to 0
sample estimates:
tau
-0.05530811
x= GrLivArea
Kendall's rank correlation tau
data: train_data$SalePrice and train_data[[x]]
z = 31.079, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
tau
0.5439421
x= BsmtFullBath
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 73.976, df = 3, p-value = 6.006e-16
x= BsmtHalfBath
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 0.23762, df = 2, p-value = 0.888
x= FullBath
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 599.36, df = 3, p-value < 2.2e-16
x= HalfBath
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 179.32, df = 2, p-value < 2.2e-16
x= BedroomAbvGr
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 101.09, df = 7, p-value < 2.2e-16
x= KitchenAbvGr
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 43.206, df = 3, p-value = 2.225e-09
x= KitchenQual
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 661.48, df = 3, p-value < 2.2e-16
x= TotRmsAbvGrd
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 419.41, df = 11, p-value < 2.2e-16
x= Functional
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 32.56, df = 6, p-value = 1.274e-05
x= Fireplaces
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 406.84, df = 3, p-value < 2.2e-16
x= FireplaceQu
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 461.9, df = 5, p-value < 2.2e-16
x= GarageType
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 532.7, df = 6, p-value < 2.2e-16
x= GarageYrBlt
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 615.92, df = 12, p-value < 2.2e-16
x= GarageFinish
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 609.81, df = 3, p-value < 2.2e-16
x= GarageCars
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 702.51, df = 4, p-value < 2.2e-16
x= GarageArea
Kendall's rank correlation tau
data: train_data$SalePrice and train_data[[x]]
z = 27.204, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
tau
0.4781465
x= GarageQual
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 192, df = 5, p-value < 2.2e-16
x= GarageCond
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 190.96, df = 5, p-value < 2.2e-16
x= PavedDrive
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 115.84, df = 2, p-value < 2.2e-16
x= WoodDeckSF
Kendall's rank correlation tau
data: train_data$SalePrice and train_data[[x]]
z = 13.684, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
tau
0.2603486
x= OpenPorchSF
Kendall's rank correlation tau
data: train_data$SalePrice and train_data[[x]]
z = 18.724, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
tau
0.350161
x= EnclosedPorch
Kendall's rank correlation tau
data: train_data$SalePrice and train_data[[x]]
z = -8.3273, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
tau
-0.1720941
x= X3SsnPorch
Kendall's rank correlation tau
data: train_data$SalePrice and train_data[[x]]
z = 2.5075, p-value = 0.01216
alternative hypothesis: true tau is not equal to 0
sample estimates:
tau
0.05347771
x= ScreenPorch
Kendall's rank correlation tau
data: train_data$SalePrice and train_data[[x]]
z = 3.8416, p-value = 0.0001222
alternative hypothesis: true tau is not equal to 0
sample estimates:
tau
0.08065437
x= PoolArea
Kendall's rank correlation tau
data: train_data$SalePrice and train_data[[x]]
z = 2.2349, p-value = 0.02542
alternative hypothesis: true tau is not equal to 0
sample estimates:
tau
0.04780012
x= PoolQC
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 5.9957, df = 3, p-value = 0.1118
x= Fence
Kruskal-Wallis rank sum test
data: train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 78.023, df = 4, p-value = 4.567e-16
x= MiscFeature
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 10.654, df = 4, p-value = 0.03075
x= MiscVal
Kendall's rank correlation tau
data: train_data$SalePrice and train_data[[x]]
z = -2.3973, p-value = 0.01652
alternative hypothesis: true tau is not equal to 0
sample estimates:
tau
-0.05091716
x= MoSold
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 14.278, df = 11, p-value = 0.218
x= YrSold
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 1.6459, df = 4, p-value = 0.8005
x= SaleType
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 156.38, df = 8, p-value < 2.2e-16
x= SaleCondition
Kruskal-Wallis rank sum test
data: train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 168.32, df = 5, p-value < 2.2e-16
# Predictors removed before modelling. In the test output above, PoolQC,
# MoSold and YrSold show no significant association with SalePrice (p > 0.05).
# NOTE(review): the remaining columns are presumably dropped for the same
# reason -- confirm against their test results earlier in the document.
uninformative_cols <- c(
  "Street", "Utilities", "LandSlope", "BsmtFinSF2",
  "BsmtHalfBath", "PoolQC", "MoSold", "YrSold"
)

# Drop the same set from both frames so their predictor columns stay aligned.
# all_of() errors if a listed column is missing, as the bare names did.
train_data <- train_data |>
  select(-all_of(uninformative_cols))
test_data <- test_data |>
  select(-all_of(uninformative_cols))
Split Data
# Fix the RNG so the hold-out split and the CV folds are reproducible.
set.seed(1024)

# Hold out part of the labelled data for the final validation step.
split_data <- initial_split(train_data)

# Build the cross-validation folds from the training portion only. The
# held-out rows are scored later by last_fit(split_data); if they also took
# part in hyper-parameter tuning, those validation metrics would be leaked.
fold_data <- vfold_cv(training(split_data))
Create Model
# Boosted-tree regression specification. The number of trees, the tree depth
# and the learning rate are fixed; mtry and min_n are left as tune()
# placeholders to be filled in by the tuning step.
model <- boost_tree(
  trees = 1000,
  tree_depth = 10,
  learn_rate = 0.01,
  mtry = tune(),
  min_n = tune()
)

# xgboost engine on 8 threads. counts = FALSE goes with the mtry_prop() range
# used during tuning (mtry given as a proportion rather than a column count).
model <- set_engine(model, "xgboost", nthread = 8, counts = FALSE)
model <- set_mode(model, "regression")
Create Recipe
# Preprocessing recipe: KNN-impute missing values group by group, then one-hot
# encode every nominal predictor.
#
# The template data is the training portion of the split, extracted
# explicitly: recipe() documents `data` as a data frame, not an rsplit object.
recipe <- recipe(SalePrice ~ ., data = training(split_data)) |>
  # Zoning, lot frontage and electrical system, from dwelling class and
  # neighbourhood.
  step_impute_knn(
    MSZoning, LotFrontage, Electrical,
    impute_with = imp_vars(MSSubClass, Neighborhood)
  ) |>
  # Exterior covering and masonry veneer type, from the other exterior
  # descriptors.
  step_impute_knn(
    Exterior1st, Exterior2nd, MasVnrType,
    impute_with = imp_vars(MasVnrArea, ExterQual, ExterCond)
  ) |>
  # Basement quality/condition fields, from the basement type and area columns.
  step_impute_knn(
    BsmtQual, BsmtCond, BsmtExposure, BsmtFinType2, BsmtFullBath,
    impute_with = imp_vars(BsmtFinType1, BsmtFinSF1, BsmtUnfSF, TotalBsmtSF)
  ) |>
  # Garage attributes, from the garage type alone.
  step_impute_knn(
    GarageYrBlt, GarageFinish, GarageCars, GarageArea, GarageQual, GarageCond,
    impute_with = imp_vars(GarageType)
  ) |>
  # Kitchen quality, from the number of kitchens.
  step_impute_knn(
    KitchenQual,
    impute_with = imp_vars(KitchenAbvGr)
  ) |>
  # Sale type, from the sale condition.
  step_impute_knn(
    SaleType,
    impute_with = imp_vars(SaleCondition)
  ) |>
  # one_hot = TRUE keeps an indicator column for every level of each nominal
  # predictor instead of dropping a reference level.
  step_dummy(
    all_nominal_predictors(),
    one_hot = TRUE
  )
Create Workflow
# Bundle the preprocessing recipe and the model specification into a single
# workflow object so they are tuned and fitted together.
workflow <- workflow()
workflow <- add_recipe(workflow, recipe)
workflow <- add_model(workflow, model)
Tune Model
# Bayesian search over the two tune() placeholders of the model, evaluated on
# the cross-validation folds. mtry is searched as a proportion (mtry_prop())
# and min_n over its default range; RMSE and R-squared are recorded.
tune <- tune_bayes(
  workflow,
  resamples = fold_data,
  param_info = parameters(
    mtry = mtry_prop(), min_n()
  ),
  metrics = metric_set(rmse, rsq)
)

# Plot the search results, then list the best candidates by RMSE.
autoplot(tune)
show_best(tune, metric = "rmse")
# A tibble: 5 × 9
min_n mtry .metric .estimator mean n std_err .config .iter
<int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr> <int>
1 6 0.100 rmse standard 0.0519 10 0.00257 Iter4 4
2 5 0.107 rmse standard 0.0520 10 0.00262 Iter6 6
3 5 0.102 rmse standard 0.0521 10 0.00256 Iter7 7
4 2 0.101 rmse standard 0.0522 10 0.00252 Iter2 2
5 6 0.117 rmse standard 0.0524 10 0.00257 Iter9 9
# Same ranking, this time by R-squared.
show_best(tune, metric = "rsq")
# A tibble: 5 × 9
min_n mtry .metric .estimator mean n std_err .config .iter
<int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr> <int>
1 6 0.100 rsq standard 0.912 10 0.00698 Iter4 4
2 5 0.107 rsq standard 0.912 10 0.00758 Iter6 6
3 2 0.101 rsq standard 0.912 10 0.00735 Iter2 2
4 5 0.102 rsq standard 0.912 10 0.00739 Iter7 7
5 6 0.117 rsq standard 0.910 10 0.00731 Iter9 9
# Take the candidate with the best RMSE and plug its values into the
# workflow's tune() placeholders.
params <- select_best(tune, metric = "rmse")
final <- finalize_workflow(workflow, params)
Validate Model
# Fit on the training portion of the split, score on the held-out portion,
# and print the resulting metrics.
collect_metrics(last_fit(final, split_data))
# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 rmse standard 0.0463 Preprocessor1_Model1
2 rsq standard 0.931 Preprocessor1_Model1
Test Model
# Refit the finalized workflow on all labelled rows.
fit <- fit(final, train_data)

# Predict the unlabelled rows and undo the base-10 log scale.
# NOTE(review): this presumes SalePrice was log10-transformed earlier in the
# document -- confirm. `^` binds tighter than `|>`, so rounding applies to the
# back-transformed price; it is written as an explicit call here.
predict <- predict(fit, test_data) |>
  rename(SalePrice = .pred) |>
  mutate(SalePrice = round(10^SalePrice, 0))

# Pair each prediction with the Id column taken from test_id.
submission <- bind_cols(select(test_id, Id), predict)
glimpse(submission)
Rows: 1,459
Columns: 2
$ Id <int> 1461, 1462, 1463, 1464, 1465, 1466, 1467, 1468, 1469, 1470, …
$ SalePrice <dbl> 126833, 158187, 184164, 192850, 188031, 174289, 170760, 1629…
Score
