Exercises on Slide 21-22
###### Read the Forbes data online
forbes <- read.csv("http://statweb.lsu.edu/faculty/li/data/Forbes2000.csv")
#1: Find the median profit for the companies in the US, UK, France and Germany
forbes.2=subset(forbes, country %in% c("United Kingdom", "United States","France","Germany"))
forbes.2$country <- forbes.2$country[,drop = TRUE]
tapply(forbes.2$profit,forbes.2$country,median,na.rm=T)
## France Germany United Kingdom United States
## 0.190 0.230 0.205 0.240
#2: Find all German companies with negative profit
forbes$name[which((forbes$profits<0)&(forbes$country=='Germany'))]
## [1] Allianz Worldwide Deutsche Telekom
## [3] E.ON HVB-HypoVereinsbank
## [5] Commerzbank Infineon Technologies
## [7] BHW Holding Bankgesellschaft Berlin
## [9] W&W-Wustenrot mg technologies
## [11] Nurnberger Beteiligungs SPAR Handels
## [13] Mobilcom
## 2000 Levels: Aareal Bank ABB Group Abbey National ... Zurich Financial Services
#3: Find the business category to which most of the Bermuda island companies belong.
BMcomp <- subset(forbes, country == "Bermuda")
table(BMcomp$category)
##
## Aerospace & defense Banking
## 0 1
## Business services & supplies Capital goods
## 0 1
## Chemicals Conglomerates
## 0 2
## Construction Consumer durables
## 0 0
## Diversified financials Drugs & biotechnology
## 0 0
## Food drink & tobacco Food markets
## 1 1
## Health care equipment & services Hotels restaurants & leisure
## 0 0
## Household & personal products Insurance
## 0 10
## Materials Media
## 0 1
## Oil & gas operations Retailing
## 2 0
## Semiconductors Software & services
## 0 1
## Technology hardware & equipment Telecommunications services
## 0 0
## Trading companies Transportation
## 0 0
## Utilities
## 0
#4: For the 50 companies in the Forbes dataset with the highest profit
# plot sales against assets, labelling each point with approriate country
# name which may need to be abbreviated (using abbreviate) to avoid
# makeing the plot look too messy
order_profit <- order(forbes$profit,decreasing=T)[1:50]
plot(forbes$sales[order_profit],forbes$assest[order_profit],
xlab="Sales",ylab="Assests",type="n")
text(forbes$sales[order_profit],forbes$assest[order_profit],
abbreviate(forbes$country[order_profit]),cex=1)

#5: Find the average value of sales for the companies in each country
tapply(forbes$sales, forbes$country, mean, na.rm = TRUE)
## Africa Australia
## 6.820000 5.244595
## Australia/ United Kingdom Austria
## 11.595000 4.142500
## Bahamas Belgium
## 1.350000 10.114444
## Bermuda Brazil
## 6.840500 6.338667
## Canada Cayman Islands
## 6.429643 1.660000
## Chile China
## 1.602500 5.099600
## Czech Republic Denmark
## 1.805000 6.349000
## Finland France
## 10.291818 20.102063
## France/ United Kingdom Germany
## 1.010000 20.781385
## Greece Hong Kong/China
## 2.528333 2.044000
## Hungary India
## 3.370000 3.868148
## Indonesia Ireland
## 2.450000 4.765000
## Islands Israel
## 6.670000 2.060000
## Italy Japan
## 10.213902 10.190633
## Jordan Kong/China
## 1.330000 5.717500
## Korea Liberia
## 15.005000 3.780000
## Luxembourg Malaysia
## 14.185000 1.716250
## Mexico Netherlands
## 3.937647 17.020714
## Netherlands/ United Kingdom New Zealand
## 92.100000 2.640000
## Norway Pakistan
## 10.780000 1.230000
## Panama/ United Kingdom Peru
## 5.930000 0.170000
## Philippines Poland
## 1.565000 4.410000
## Portugal Russia
## 3.884286 7.672500
## Singapore South Africa
## 3.685000 4.124000
## South Korea Spain
## 7.969333 7.843448
## Sweden Switzerland
## 7.665769 12.456765
## Taiwan Thailand
## 2.751429 2.513333
## Turkey United Kingdom
## 4.713333 10.445109
## United Kingdom/ Australia United Kingdom/ Netherlands
## 10.010000 7.540000
## United Kingdom/ South Africa United States
## 2.060000 10.058256
## Venezuela
## 0.980000
#6: Find the number of companies in each country with profits above 5 billion US dollars
comp.5b <- subset(forbes, profits >5)
comp.5b$country <- comp.5b$country[,drop = TRUE] #drop the rest of the country information
table(comp.5b$country)
##
## China France
## 1 1
## Germany Japan
## 1 1
## Netherlands/ United Kingdom South Korea
## 1 1
## Switzerland United Kingdom
## 3 3
## United States
## 20
#7. Fit a logistic regression model on the South African Heart Disease Dataset. The dataset
#is available from the Elements of Statistical Learning (EOSL) webpage:
#http://statweb.stanford.edu/~tibs/ElemStatLearn/
#You can also see the data description on EOSL webpage.
#You can get the data into R by the following code in R
data <- read.table("http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/SAheart.data", sep=",", head=T, row.names=1)
#7.a) Set the 'Present' as 1 and 'Absent' as 0 for variable 'famhist'.
data[,5] <- as.numeric(data[,5]=="Present")
#7.b) There are 462 observations in the dataset. Randomly split the dataset
#into 400 observations as the training set. The rest 62 observations as the
#test set.
set.seed(1)
indx <- sample(1:462,size=400,replace=F)
train.data <- data[indx,]
test.data <- data[-indx,]
#7.c) Then fit a logistic regression using 'famhist' (now become 0 and 1 binary
#variable) as the response and all the other variables as the explanatory variables.
fit <- glm(famhist~.,data=train.data,family="binomial")
#7.d) Make the prediction on the training and test sets. Using the 0.5 as the
#cutoff point to get the misclassification rate on the training and test sets, respectively.
prob.train <- fit$fit
prob.test <- predict(fit,newdata=test.data,type="response")
tab.1 <- table(prob.train>=0.5,train.data$famhist)
tab.1
##
## 0 1
## FALSE 191 81
## TRUE 46 82
1-sum(diag(tab.1))/sum(tab.1)
## [1] 0.3175
tab.2 <- table(prob.test>=0.5,test.data$famhist)
tab.2
##
## 0 1
## FALSE 25 18
## TRUE 8 11
1-sum(diag(tab.2))/sum(tab.2)
## [1] 0.4193548
#7.d) Make the prediction on the training and test sets. Using the 0.5 as the
#cutoff point to get the misclassification rate on the training and test sets, respectively.
library(AUC)
## AUC 0.3.0
## Type AUCNews() to see the change log and ?AUC to get an overview.
auc.score <- auc(roc(prob.test,factor(test.data$famhist)))
auc.score
## [1] 0.6750261
spe <- specificity(prob.test,factor(test.data$famhist))$measure
sen <- sensitivity(prob.test,factor(test.data$famhist))$measure
plot(1-spe,sen,xlab="1-Specificity",ylab="Sensitivity",type="l",col="red",lwd=2)
