Saturday, February 18, 2023

Data Transformation in R

#---- import data file (data_file )

# set working directory

setwd("C:/Users/khurr/Desktop/khurram")

data_khurram=read_excel("data_khurram_pot.xlsx", sheet = "pot_plant",na="***")

data_khurram_soil=read_excel("data_khurram_pot.xlsx", sheet = "pot_soil",na="***")

# data frame

data_khurram=data.frame(data_khurram)

# check structure of data

str(data_khurram)

# convrt treatment variables into factors

data_khurram<- within(data_khurram, {

    treatment<- factor(treatment) 

  replication<- factor(replication)

  

})

str(data_khurram)

#-----------------------------------------------------------------------------------------------------

#---- plant_height



#=== plant_height-Anova


plant_height <- data_khurram


plant_height.rlm <- lm(plant_height~treatment + replication,

                       data=plant_height)


anova(plant_height.rlm)


plant_height.p <- round(anova(plant_height.rlm)[,5],2)


#-----------------------------------------------------------------------------------------------------------------

#--------------- normality test

#----------------------------------------------------------------------------------------------------------------

plant_height.rout <- resid(plant_height.rlm, type = "pearson")

summary(plant_height.rout)

plant_height.rout[which(abs(plant_height.rout)>2.5)]


shapiro.test(residuals(plant_height.rlm)) 

par(mfrow=c(2,2))

plot(fitted.values(plant_height.rlm), resid(plant_height.rlm, type = "pearson"), 

     col="blue");abline(h=0)

hist(residuals(plant_height.rlm))

qqnorm(residuals(plant_height.rlm));qqline(residuals(plant_height.rlm))

qqPlot(residuals(plant_height.rlm), dist= "norm", col=palette()[1], 

       ylab="Residual Quantiles", main="Normal Probability Plot", 

       pch=19)


#=======================================================================================================

#-------------------------Data transformation ways------------------------------------------

#=======================================================================================================


#----- box plot transformation


###___

# tplant_height <- with(plant_height,boxcox(plant_height~treatment + replication,

#                                           lambda = seq(-2, 2, length = 100)))

# tmplant_height <- tplant_height$x[which.max(tplant_height$y)]

# tmplant_height2 <- (tmplant_height-1)/tmplant_height

# tmplant_height;tmplant_height2


#----------------------------------------------------------------------------------------------

#------ skewed transformation

#----------------------------------------------------------------------------------------------


#-------------------The direction of skewness is given by the sign of the skewness coefficient:


#---------A zero means no skewness at all (normal distribution).

#---------A negative value means the distribution is negatively skewed.

#---------A positive value means the distribution is positively skewed.


#----------------The larger the value of skewness, the larger the distribution differs from a normal distribution.


skewness(plant_height$plant_height)


#------------- One method (square-root for moderate skew)-----------



#sqrt(x)               #for positively skewed data,


#sqrt(max(x+1) - x)     # for negatively skewed data


#------------- Second method (log for greater skew:)-----------



#log10(x)             #for positively skewed data


#log10(max(x+1) - x)  #for negatively skewed data



#------------- third method (inverse for severe skew:)-----------


#1/x for              #positively skewed data


#1/(max(x+1) - x)     #for negatively skewed data


#------------- Linearity and heteroscedasticity:


#First try log transformation in a situation where the dependent variable starts to increase more rapidly with 

#increasing independent variable values


#If your data does the opposite - dependent variable values decrease more rapidly with increasing independent variable values - you can first consider

#a square transformation.

#===========================================================================================================

#----------------------- Adapted as per conditions of data----------------------------------

#=========================================================================================================

#------------- third method (inverse for severe skew:)-----------


#1/(max(x+1) - x)     #for negatively skewed data


#-------------------------------------------------------------------------------------------------

#plant_height$plant_height2= ( 1/plant_height$plant_height ) # positively skewed data

plant_height$plant_height2= 1/(max(plant_height$plant_height+1) - plant_height$plant_height) # negatively skewed data 

#----------------------------------------------------------------------------------------------------------------------

#-- Check data normality after data transformation

#-------------------------------------------------------------------------------------------------------


plant_height2 <- plant_height


plant_height2.rlm <- lm(plant_height2~treatment + replication,

                        data=plant_height2)


anova(plant_height2.rlm)


plant_height.p <- round(anova(plant_height2.rlm)[,5],2)


#-- normality


plant_height2.rout <- resid(plant_height2.rlm, type = "pearson")

summary(plant_height2.rout)

plant_height2.rout[which(abs(plant_height2.rout)>2.5)]


shapiro.test(residuals(plant_height2.rlm)) 

par(mfrow=c(2,2))

plot(fitted.values(plant_height2.rlm), resid(plant_height2.rlm, type = "pearson"), 

     col="blue");abline(h=0)

hist(residuals(plant_height2.rlm))

qqnorm(residuals(plant_height2.rlm));qqline(residuals(plant_height2.rlm))

qqPlot(residuals(plant_height2.rlm), dist= "norm", col=palette()[1], 

       ylab="Residual Quantiles", main="Normal Probability Plot", 

       pch=19)

#-------------------------------------------------------------------------------------------------


#--- Before data transformation


#-----------------------------------------------------------------

aov.out<- aov(plant_height~treatment, data=plant_height)


LSD.test(aov.out,"treatment", console = TRUE)

#--------------------------------------------------------------------

#--- After data transformation

#-------------------------------------------------------------------

aov.out<- aov(plant_height2~treatment, data=plant_height)


LSD.test(aov.out,"treatment", console = TRUE)

#------------------------------------------------------------------




No comments:

Post a Comment