This dataset was originally used as an illustration in the paper on penalized discriminant analysis by Hastie, Buja and Tibshirani (1995). The data were extracted from the TIMIT database (TIMIT Acoustic-Phonetic Continuous Speech Corpus, NTIS, US Dept of Commerce), a widely used resource for speech-recognition research. A dataset was formed by selecting five phonemes for classification based on digitized speech from this database. The phonemes are transcribed as follows: "sh" as in "she", "dcl" as in "dark", "iy" as the vowel in "she", "aa" as the vowel in "dark", and "ao" as the first vowel in "water". From continuous speech of 50 male speakers, 4509 speech frames of 32 msec duration were selected, approximately two examples of each phoneme from each speaker. Each frame consists of 512 samples at a 16 kHz sampling rate and represents one of the five phonemes above.
From each speech frame, we computed a log-periodogram, which is one of several widely used methods for casting speech data in a form suitable for speech recognition. Thus the data used in what follows consist of 4509 log-periodograms of length 256, with known class (phoneme) memberships.
The data contain 256 predictor columns labelled "x.1"-"x.256", a response column labelled "g", and a column labelled "speaker" identifying the different speakers.
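The log-periodogram of a single 512-sample frame can be sketched in base R from the squared modulus of the FFT. This is an illustrative reconstruction on simulated noise; the exact windowing and scaling used to build the dataset may differ.

```r
# Sketch: log-periodogram of one 512-sample frame (illustrative only;
# the dataset's exact preprocessing may differ)
set.seed(1)
frame <- rnorm(512)               # stand-in for one 32-msec speech frame
per   <- Mod(fft(frame))^2 / 512  # raw periodogram
logp  <- log(per[1:256])          # keep the 256 positive frequencies
length(logp)                      # 256, matching columns x.1-x.256
```

Keeping the first 256 frequencies of a 512-point transform is what gives each observation its length-256 feature vector.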
# You can import the data directly from the ESL (Elements of Statistical Learning) website:
# loc1 <- "https://web.stanford.edu/~hastie/ElemStatLearn/datasets/phoneme.data"
loc1 <- "C:/Users/bli/Desktop/AppStat/datasets/phoneme.DATA"
tmp1 <- as.matrix(read.table(loc1, header = TRUE, sep = ","))  # column 259 holds the speaker info
y <- factor(tmp1[, 258])       # class-membership vector ("g")
x <- as.matrix(tmp1[, 2:257])  # log-periodogram matrix
mode(x) <- "numeric"
dim(x)
## [1] 4509 256
table(y)
## y
## aa ao dcl iy sh
## 695 1022 757 1163 872
Let's take a look at the data.
# Visualize the raw data
col <- as.numeric(y)
matplot(1:256, t(x[1:20, ]), col = col[1:20], type = "l",
        xlab = "Frequency", ylab = "Log-periodogram")
legend("topright", legend = levels(y), lty = 1, col = 1:5)
# PCA on the first 1000 frames
pca <- princomp(x[1:1000, ], cor = FALSE)
plot(pca$scores[, 1:2], col = col[1:1000], xlab = "PC1 (52.9%)", ylab = "PC2 (18.0%)",
     pch = col[1:1000], cex = 0.7)
legend("topleft", legend = levels(y), pch = 1:5, col = 1:5)
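The percentages in the axis labels are the proportions of variance explained (PVE) by each component, which `princomp` exposes through the component standard deviations. The two lines below show the computation on synthetic data; applying the same lines to the `pca` object above yields the percentages in the plot.

```r
# Proportion of variance explained from princomp (synthetic data for illustration)
set.seed(1)
xs  <- matrix(rnorm(1000 * 10), 1000, 10)
pc  <- princomp(xs, cor = FALSE)
pve <- pc$sdev^2 / sum(pc$sdev^2)  # eigenvalue shares, sorted in decreasing order
round(100 * pve[1:2], 1)           # percentages for the first two components
```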
Let's first try penalized discriminant analysis (PDA) with a generalized ridge penalty and visualize the first two penalized canonical variates.
library(mda)
N <- nrow(x)  # 4509
# Use a small training set as an illustration
data <- as.data.frame(x); data$Y <- y  # full dataset
n <- 500
set.seed(1)
id <- sample(1:N, size = n, replace = FALSE)
pda.fit <- mda(Y ~ ., data = data[id, ], method = gen.ridge)
pda.prd <- predict(pda.fit, data[-id, ], type = "class")
# Confusion table
table(pda.prd, data$Y[-id])
##
## pda.prd aa ao dcl iy sh
## aa 393 208 1 0 1
## ao 232 705 0 0 0
## dcl 0 0 649 28 0
## iy 0 0 12 1008 0
## sh 0 0 0 1 771
# Misclassification rate
1 - sum(pda.prd == data$Y[-id]) / length(data$Y[-id])
## [1] 0.1204789
# Visualize the first two penalized canonical variates
pda.var <- predict(pda.fit, data[id, ], type = "variates", dimension = 2)
plot(pda.var, xlab = "Dim 1", ylab = "Dim 2", col = col[id], pch = col[id], cex = 0.7)
legend("bottomright", legend = levels(y), pch = 1:5, col = 1:5, cex = 1.5)
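The fit above uses `gen.ridge` with its default regularization parameter `lambda = 1`. Assuming `mda` forwards extra arguments through `...` to the method function (as its interface suggests), a small grid search on held-out data is one way to choose `lambda`. A minimal sketch on the built-in `iris` data; the grid and the split size are arbitrary illustrative choices:

```r
# Sketch: tuning gen.ridge's lambda by held-out misclassification (iris data;
# assumes mda passes lambda through ... to gen.ridge)
library(mda)
set.seed(1)
tr  <- sample(nrow(iris), 100)
err <- sapply(c(0.1, 1, 10, 100), function(l) {
  fit <- mda(Species ~ ., data = iris[tr, ], method = gen.ridge, lambda = l)
  mean(predict(fit, iris[-tr, ], type = "class") != iris$Species[-tr])
})
err  # held-out misclassification rate for each lambda in the grid
```

The same loop applied to `data[id, ]` and `data[-id, ]` would tune the phoneme fit, at correspondingly higher cost.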
Let's apply flexible discriminant analysis (FDA) using the MARS (Multivariate Adaptive Regression Splines, Friedman (1991)) basis.
fda.fit <- fda(Y ~ ., data = data[id, ], method = mars)
fda.prd <- predict(fda.fit, data[-id, ], type = "class")
# Confusion table
table(fda.prd, data$Y[-id])
##
## fda.prd aa ao dcl iy sh
## aa 443 178 0 0 0
## ao 181 731 0 1 0
## dcl 1 2 653 2 0
## iy 0 2 7 1033 0
## sh 0 0 2 1 772
# Misclassification rate
1 - sum(fda.prd == data$Y[-id]) / length(data$Y[-id])
## [1] 0.09403841
plot(fda.fit, pcex = 1)
legend("bottomright", legend = levels(y), pch = c("1", "2", "3", "4", "5"), cex = 1.5)
We can also use the BRUTO basis in FDA.
fda.fit2 <- fda(Y ~ ., data = data[id, ], method = bruto)
fda.prd2 <- predict(fda.fit2, data[-id, ], type = "class")
# Confusion table
table(fda.prd2, data$Y[-id])
##
## fda.prd2 aa ao dcl iy sh
## aa 459 177 0 0 0
## ao 166 735 0 1 0
## dcl 0 0 644 2 0
## iy 0 1 16 1033 3
## sh 0 0 2 1 769
# Misclassification rate
1 - sum(fda.prd2 == data$Y[-id]) / length(data$Y[-id])
## [1] 0.0920429
plot(fda.fit2, pcex = 1)
legend("topleft", legend = levels(y), pch = c("1", "2", "3", "4", "5"), cex = 1.5)
As a reference, let's look at LDA's performance.
library(MASS)
lda.fit <- lda(x[id, ], y[id])
lda.prd <- predict(lda.fit, x[-id, ])
# Confusion table
table(lda.prd$class, y[-id])
##
## aa ao dcl iy sh
## aa 435 215 1 0 1
## ao 190 698 0 0 0
## dcl 0 0 645 22 0
## iy 0 0 14 1014 1
## sh 0 0 2 1 770
# Misclassification rate
1 - sum(lda.prd$class == y[-id]) / length(y[-id])
## [1] 0.1114991
# Visualize the first two linear discriminants (training data)
lda.var <- predict(lda.fit, dimen = 2)$x
plot(lda.var, xlab = "Dim 1", ylab = "Dim 2", col = col[id], pch = col[id], cex = 0.7)
legend("topleft", legend = levels(y), pch = 1:5, col = 1:5, cex = 1.5)
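Each misclassification rate above is simply one minus the trace of the corresponding confusion table divided by its total. As a sanity check, re-entering the LDA confusion table printed above reproduces the reported rate:

```r
# Recompute the LDA misclassification rate from its confusion table
cm <- matrix(c(435, 215,   1,    0,   1,
               190, 698,   0,    0,   0,
                 0,   0, 645,   22,   0,
                 0,   0,  14, 1014,   1,
                 0,   0,   2,    1, 770),
             nrow = 5, byrow = TRUE,
             dimnames = list(pred = c("aa", "ao", "dcl", "iy", "sh"),
                             true = c("aa", "ao", "dcl", "iy", "sh")))
1 - sum(diag(cm)) / sum(cm)  # 0.1114991, matching the rate above
```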
Hastie, T., Buja, A. and Tibshirani, R. (1995) "Penalized Discriminant Analysis", Annals of Statistics, 23(1), 73-102.
Friedman, J. (1991) "Multivariate Adaptive Regression Splines" (with discussion), Annals of Statistics, 19(1), 1-141.