# -
# Notifications
# You must be signed in to change notification settings - Fork 0
# Linear Regression Cross Validation
# Method 1: split the data with a random logical mask, then fit two models.
Ad <- read.csv("Advertising.csv")

# Separate the dataset into a training set and a test set.
# Each row gets its own uniform draw; rows whose draw exceeds 0.25 go to the
# training set, so the split is about 75/25 on average rather than exactly.
set.seed(1) # fixed seed so the random split is reproducible
trainRows <- runif(nrow(Ad)) > 0.25
train <- Ad[trainRows, ] # make the training dataset
test <- Ad[!trainRows, ] # put the rest of the data into the test set

# Fit two linear regression models on the training data.
fit1 <- lm(Sales ~ Radio + Newspaper, data = train)
summary(fit1)

fit2 <- lm(Sales ~ TV + Radio, data = train)
summary(fit2)
# Method 2: sample 75% of the row indices for training and compare the two
# models by their mean squared error on the remaining rows.
set.seed(1)
Ad <- read.csv("Advertising.csv")
train <- sample(seq_len(nrow(Ad)), 0.75 * nrow(Ad))
test <- -train # negative indices select every row that is not in `train`

# First model we run: Sales predicted using TV and Newspaper.
fit1 <- lm(Sales ~ TV + Newspaper, data = Ad[train, ])
# Second model we run: Sales predicted using TV and Radio.
fit2 <- lm(Sales ~ TV + Radio, data = Ad[train, ])

# Test our models on the test dataset.
# NOTICE: when we predict on the test set we must write "newdata =" rather
# than "data =".
Sales.test.pred1 <- predict(fit1, newdata = Ad[test, ])
Sales.test.pred2 <- predict(fit2, newdata = Ad[test, ])

# Mean squared error on the test rows. Sales is taken from Ad explicitly, so
# this does not depend on attach(Ad).
mse1 <- mean((Ad$Sales[test] - Sales.test.pred1)^2)
mse2 <- mean((Ad$Sales[test] - Sales.test.pred2)^2)

# mse2 is much lower. Seems that the second model is a better representative
# of the data.
# Generating new variables can come in handy in various models.
# As an example, we will generate an interaction term.
# Columns are referenced through Ad$ so this works without attach(Ad).
Ad$TV_Newspaper <- Ad$TV * Ad$Newspaper
fit3 <- lm(Sales ~ TV + Newspaper + TV_Newspaper, data = Ad[train, ])
summary(fit3)

# Another way of adding the interaction term to your regression:
# inside a formula, TV * Newspaper expands to TV + Newspaper + TV:Newspaper.
fit4 <- lm(Sales ~ TV * Newspaper, data = Ad[train, ])
summary(fit4)
######################################
# Some more variable manipulation
# The path uses forward slashes: inside an R string a backslash starts an
# escape sequence, so "C:\Users\..." fails to parse.
college <- read.csv("C:/Users/umair/Documents/Columbia/Business Analytics for OR/course_files_export/Recitations/Recitation 2/College.csv")
head(college)

# Recode Private as a 0/1 indicator: 1 where the value is "Yes", 0 otherwise.
# The column is read from `college` directly instead of via attach(college).
college$Private <- ifelse(college$Private == "Yes", 1, 0)
college$Private
######################################
# To calculate the confidence intervals of the sample coefficients on TV and
# Newspaper, use the confint command. The default level is 0.95.
confint(fit4)

# To calculate a prediction interval for a particular point, use the predict
# command.
predict(fit4, data.frame(TV = 60, Newspaper = 1300),
  interval = "prediction", level = 0.95
)
# Gives the same answer: the default level is 0.95.
predict(fit4, data.frame(TV = 60, Newspaper = 1300), interval = "prediction")
predict(fit4, data.frame(TV = 60, Newspaper = 1300),
  interval = "prediction", level = 0.99
)
# As you can see, predictions for individual points are not that precise.
# This is expected, because individual points can vary considerably.
#######################################
# Regression with non-linear independent variables
# The path uses forward slashes: inside an R string a backslash starts an
# escape sequence, so "C:\Users\..." fails to parse.
Ad <- read.csv("C:/Users/umair/Documents/Columbia/Business Analytics for OR/course_files_export/Recitations/Recitation 2/Advertising.csv")

# The data frame is passed through `data =` instead of attach(Ad).
l1 <- lm(Sales ~ TV, data = Ad)
summary(l1)

# I() makes ^ mean arithmetic squaring inside the formula, so this regresses
# Sales on the square of TV.
l2 <- lm(Sales ~ I(TV^2), data = Ad)
summary(l2)
# Packages used below: Hmisc for cut2(), dplyr for the %>% pipelines.
# Hmisc is attached first and dplyr second, as in the original order; the
# extra require(Hmisc) was redundant once library(Hmisc) has run.
library(Hmisc)
library(dplyr)

# Load data
eCarData <- read.csv("Car_Data--Extract.csv")
#######
# Generate bins for APR, with cut points every 0.5 from 4 to 8.5.
eCarData$RateBin <- cut2(
  eCarData$Rate,
  c(4, 4.5, 5, 5.5, 6, 6.5, 7, 7.5, 8, 8.5)
)

# Select Outcome and the APR bins, group by the APR bins, and calculate the
# average Outcome for each bin.
Conversion_Rate_VS_APR <- eCarData %>%
  select(Outcome, RateBin) %>%
  group_by(RateBin) %>%
  summarise(Conversion_Rate = mean(Outcome))

# Generate bar chart, labelling each bar with its APR bin.
barplot(Conversion_Rate_VS_APR$Conversion_Rate,
  ylab = "Conversion Rate", xlab = "APR",
  main = "Conversion Rate vs APR",
  names.arg = Conversion_Rate_VS_APR$RateBin
)

# Same chart without the bin labels under the bars.
barplot(Conversion_Rate_VS_APR$Conversion_Rate,
  ylab = "Conversion Rate", xlab = "APR",
  main = "Conversion Rate vs APR"
)
#######
# page 6/19 first graph

# Number of quotes per APR bin: tag every row with a 1, keep only that tag
# and the bin, then sum the tags within each bin.
Quotes_by_APR <- eCarData %>%
  mutate(count = 1) %>%
  select(count, RateBin) %>%
  group_by(RateBin) %>%
  summarise(Quotes = sum(count))

# Bar chart of the per-bin totals, one labelled bar per APR bin.
barplot(
  Quotes_by_APR$Quotes,
  names.arg = Quotes_by_APR$RateBin,
  main = "Number of Quotes by APR",
  xlab = "APR",
  ylab = "Number of Quotes"
)
#######
# Keep only the rows of `college` whose Private column equals 1.
college_private <- filter(college, Private == 1)
#######
# ADDENDUM

# Correlation between different variables in a dataset can be calculated
# using the cor() function.
# The earlier rm(list = ls(all = TRUE)) call was dropped: it deletes every
# object in the workspace, and the fresh read below does not need it.
Ad <- read.csv("Advertising.csv")
cor(Ad)