Homework II

Exercises on Slide 21-22

###### Read the Forbes data online
forbes <- read.csv("http://statweb.lsu.edu/faculty/li/data/Forbes2000.csv")

#1: Find the median profit for the companies in the US, UK, France and Germany
forbes.2=subset(forbes, country %in% c("United Kingdom", "United States","France","Germany"))
forbes.2$country <- forbes.2$country[,drop = TRUE] 
tapply(forbes.2$profit,forbes.2$country,median,na.rm=T)

##         France        Germany United Kingdom  United States 
##          0.190          0.230          0.205          0.240

#2: Find all German companies with negative profit
forbes$name[which((forbes$profits<0)&(forbes$country=='Germany'))]

##  [1] Allianz Worldwide       Deutsche Telekom       
##  [3] E.ON                    HVB-HypoVereinsbank    
##  [5] Commerzbank             Infineon Technologies  
##  [7] BHW Holding             Bankgesellschaft Berlin
##  [9] W&W-Wustenrot           mg technologies        
## [11] Nurnberger Beteiligungs SPAR Handels           
## [13] Mobilcom               
## 2000 Levels: Aareal Bank ABB Group Abbey National ... Zurich Financial Services

#3: Find the business category to which most of the Bermuda island companies belong.
BMcomp <- subset(forbes, country == "Bermuda")
table(BMcomp$category)

## 
##              Aerospace & defense                          Banking 
##                                0                                1 
##     Business services & supplies                    Capital goods 
##                                0                                1 
##                        Chemicals                    Conglomerates 
##                                0                                2 
##                     Construction                Consumer durables 
##                                0                                0 
##           Diversified financials            Drugs & biotechnology 
##                                0                                0 
##             Food drink & tobacco                     Food markets 
##                                1                                1 
## Health care equipment & services     Hotels restaurants & leisure 
##                                0                                0 
##    Household & personal products                        Insurance 
##                                0                               10 
##                        Materials                            Media 
##                                0                                1 
##             Oil & gas operations                        Retailing 
##                                2                                0 
##                   Semiconductors              Software & services 
##                                0                                1 
##  Technology hardware & equipment      Telecommunications services 
##                                0                                0 
##                Trading companies                   Transportation 
##                                0                                0 
##                        Utilities 
##                                0

#4: For the 50 companies in the Forbes dataset with the highest profit
#   plot sales against assets, labelling each point with approriate country 
#   name which may need to be abbreviated (using abbreviate) to avoid 
#   makeing the plot look too messy
order_profit <- order(forbes$profit,decreasing=T)[1:50]
plot(forbes$sales[order_profit],forbes$assest[order_profit],
     xlab="Sales",ylab="Assests",type="n")
text(forbes$sales[order_profit],forbes$assest[order_profit],
     abbreviate(forbes$country[order_profit]),cex=1)

#5: Find the average value of sales for the companies in each country
tapply(forbes$sales, forbes$country, mean, na.rm = TRUE)

##                       Africa                    Australia 
##                     6.820000                     5.244595 
##    Australia/ United Kingdom                      Austria 
##                    11.595000                     4.142500 
##                      Bahamas                      Belgium 
##                     1.350000                    10.114444 
##                      Bermuda                       Brazil 
##                     6.840500                     6.338667 
##                       Canada               Cayman Islands 
##                     6.429643                     1.660000 
##                        Chile                        China 
##                     1.602500                     5.099600 
##               Czech Republic                      Denmark 
##                     1.805000                     6.349000 
##                      Finland                       France 
##                    10.291818                    20.102063 
##       France/ United Kingdom                      Germany 
##                     1.010000                    20.781385 
##                       Greece              Hong Kong/China 
##                     2.528333                     2.044000 
##                      Hungary                        India 
##                     3.370000                     3.868148 
##                    Indonesia                      Ireland 
##                     2.450000                     4.765000 
##                      Islands                       Israel 
##                     6.670000                     2.060000 
##                        Italy                        Japan 
##                    10.213902                    10.190633 
##                       Jordan                   Kong/China 
##                     1.330000                     5.717500 
##                        Korea                      Liberia 
##                    15.005000                     3.780000 
##                   Luxembourg                     Malaysia 
##                    14.185000                     1.716250 
##                       Mexico                  Netherlands 
##                     3.937647                    17.020714 
##  Netherlands/ United Kingdom                  New Zealand 
##                    92.100000                     2.640000 
##                       Norway                     Pakistan 
##                    10.780000                     1.230000 
##       Panama/ United Kingdom                         Peru 
##                     5.930000                     0.170000 
##                  Philippines                       Poland 
##                     1.565000                     4.410000 
##                     Portugal                       Russia 
##                     3.884286                     7.672500 
##                    Singapore                 South Africa 
##                     3.685000                     4.124000 
##                  South Korea                        Spain 
##                     7.969333                     7.843448 
##                       Sweden                  Switzerland 
##                     7.665769                    12.456765 
##                       Taiwan                     Thailand 
##                     2.751429                     2.513333 
##                       Turkey               United Kingdom 
##                     4.713333                    10.445109 
##    United Kingdom/ Australia  United Kingdom/ Netherlands 
##                    10.010000                     7.540000 
## United Kingdom/ South Africa                United States 
##                     2.060000                    10.058256 
##                    Venezuela 
##                     0.980000

#6: Find the number of companies in each country with profits above 5 billion US dollars
comp.5b <- subset(forbes, profits >5)
comp.5b$country <- comp.5b$country[,drop = TRUE] #drop the rest of the country information
table(comp.5b$country)

## 
##                       China                      France 
##                           1                           1 
##                     Germany                       Japan 
##                           1                           1 
## Netherlands/ United Kingdom                 South Korea 
##                           1                           1 
##                 Switzerland              United Kingdom 
##                           3                           3 
##               United States 
##                          20

#7. Fit a logistic regression model on the South African Heart Disease Dataset. The dataset
#is available from the Elements of Statistical Learning (EOSL) webpage: 
#http://statweb.stanford.edu/~tibs/ElemStatLearn/
#You can also see the data description on EOSL webpage. 
#You can get the data into R by the following code in R

data <- read.table("http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/SAheart.data", sep=",", head=T, row.names=1)

#7.a) Set the 'Present' as 1 and 'Absent' as 0 for variable 'famhist'.
data[,5] <- as.numeric(data[,5]=="Present")

#7.b) There are 462 observations in the dataset. Randomly split the dataset 
#into 400 observations as the training set. The rest 62 observations as the 
#test set. 

set.seed(1)
indx <- sample(1:462,size=400,replace=F)
train.data <- data[indx,]
test.data  <- data[-indx,]

#7.c) Then fit a logistic regression using 'famhist' (now become 0 and 1 binary 
#variable) as the response and all the other variables as the explanatory variables. 
fit <- glm(famhist~.,data=train.data,family="binomial")

#7.d) Make the prediction on the training and test sets. Using the 0.5 as the 
#cutoff point to get the misclassification rate on the training and test sets, respectively. 

prob.train <- fit$fit
prob.test  <- predict(fit,newdata=test.data,type="response")

tab.1 <- table(prob.train>=0.5,train.data$famhist)
tab.1

##        
##           0   1
##   FALSE 191  81
##   TRUE   46  82

1-sum(diag(tab.1))/sum(tab.1)

## [1] 0.3175

tab.2 <- table(prob.test>=0.5,test.data$famhist)
tab.2

##        
##          0  1
##   FALSE 25 18
##   TRUE   8 11

1-sum(diag(tab.2))/sum(tab.2)

## [1] 0.4193548

#7.d) Make the prediction on the training and test sets. Using the 0.5 as the 
#cutoff point to get the misclassification rate on the training and test sets, respectively. 

library(AUC)

## AUC 0.3.0

## Type AUCNews() to see the change log and ?AUC to get an overview.

auc.score <- auc(roc(prob.test,factor(test.data$famhist)))
auc.score

## [1] 0.6750261

spe <- specificity(prob.test,factor(test.data$famhist))$measure
sen <- sensitivity(prob.test,factor(test.data$famhist))$measure
plot(1-spe,sen,xlab="1-Specificity",ylab="Sensitivity",type="l",col="red",lwd=2)

Homework II

Bin Li

EXST4142 & EXST7142

Exercises on Slide 21-22