Document complémentaire au module 8 du cours SDD I de 2025-2026. Distribué sous licence CC BY-NC-SA 4.0.

Veuillez vous référer au cours en ligne pour les explications et les interprétations de cette analyse.

Installer un environnement R adéquat pour reproduire cette analyse.

# Initialise the SciViews::R dialect together with its inference module ("infer").
# NOTE(review): lang = "fr" presumably selects French for labels/messages — confirm.
SciViews::R("infer", lang = "fr")

Calculs avec la distribution t de Student

# Right-tail probability beyond the quantile 8.5 for t(n = 9; mu = 8, sd = 2/3).
# pt() works on the standardised scale, so the quantile is first centred on mu
# and divided by s; df = 8 corresponds to n - 1.
mu <- 8
s <- 2 / 3
pt(q = (8.5 - mu) / s, df = 8, lower.tail = FALSE)
## [1] 0.2373656
# Quantile leaving 5% of the area on its left for t(n = 9; mu = 8, sd = 2/3).
# qt() returns the standardised quantile (left tail is its default), which is
# then mapped back to the original scale with mu + s * q.
mu <- 8
s <- 2 / 3
mu + s * qt(p = 0.05, df = 8)
## [1] 6.760301
# Solve the two previous questions again, this time with a distribution object
# built by dist_student_t()
student_t <- dist_student_t(df = 8, mu = 8, sigma = 2/3)
# Area to the right of the quantile 8.5 (1 - x because cdf() always returns the left-hand area)
1 - cdf(student_t, 8.5)
## [1] 0.2373656
# Quantile bounding a left-hand area of 5%
quantile(student_t, 0.05)
## [1] 6.760301
# Two-tailed area: left of 8 - 0.5 plus right of 8 + 0.5
student_t <- dist_student_t(df = 8, mu = 8, sigma = 2/3)
(left_area <- cdf(student_t, 7.5))
## [1] 0.2373656
(right_area <- 1 - cdf(student_t, 8.5))
## [1] 0.2373656
left_area + right_area
## [1] 0.4747312
# Same result, more concisely: both tails have the same area (see the two
# values printed above), so the left-hand one is simply doubled
cdf(student_t, 7.5) * 2
## [1] 0.4747312
# Left quantile (cumulative probability 0.025)
quantile(student_t, 0.025)
## [1] 6.462664
# Right quantile (cumulative probability 1 - 0.025)
quantile(student_t, 1 - 0.025)
## [1] 9.537336

Test t de Student

# Load the crabs dataset from the MASS package, then print a skimr summary of it.
# NOTE(review): lang = "fr" presumably requests French labels — confirm.
crabs <- read("crabs", package = "MASS", lang = "fr")
skimr::skim(crabs)
Data summary
Name crabs
Number of rows 200
Number of columns 8
_______________________
Column type frequency:
factor 2
numeric 6
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
species 0 1 FALSE 2 B: 100, O: 100
sex 0 1 FALSE 2 F: 100, M: 100

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
index 0 1 25.50 14.47 1.0 13.00 25.50 38.00 50.0 ▇▇▇▇▇
front 0 1 15.58 3.50 7.2 12.90 15.55 18.05 23.1 ▂▆▇▆▃
rear 0 1 12.74 2.57 6.5 11.00 12.80 14.30 20.2 ▂▆▇▃▁
length 0 1 32.11 7.12 14.7 27.28 32.10 37.23 47.6 ▂▆▇▇▃
width 0 1 36.41 7.87 17.1 31.50 36.80 42.00 54.6 ▂▆▇▇▂
depth 0 1 14.03 3.42 6.1 11.40 13.90 16.60 21.6 ▂▅▇▆▂
# Boxplot of rear by sex for the crabs data
chart(data = crabs, rear ~ sex) +
  geom_boxplot()

# Student's t-test: mean of rear (rear width) compared between the two sexes,
# two-sided, 95% confidence level, assuming equal variances (var.equal = TRUE)
t.test(data = crabs, rear ~ sex,
  alternative = "two.sided", conf.level = 0.95, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  rear by sex
## t = 4.2896, df = 198, p-value = 2.797e-05
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
##  0.8087907 2.1852093
## sample estimates:
## mean in group F mean in group M 
##          13.487          11.990
# Welch variant of Student's t-test, for unequal variances (var.equal = FALSE)
t.test(data = crabs, rear ~ sex,
  alternative = "two.sided", conf.level = 0.95, var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  rear by sex
## t = 4.2896, df = 187.76, p-value = 2.862e-05
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
##  0.8085599 2.1854401
## sample estimates:
## mean in group F mean in group M 
##          13.487          11.990
# One-sided (right-tailed, alternative = "greater") Student's t-test with
# unequal variances
t.test(data = crabs, rear ~ sex,
  alternative = "greater", conf.level = 0.95, var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  rear by sex
## t = 4.2896, df = 187.76, p-value = 1.431e-05
## alternative hypothesis: true difference in means between group F and group M is greater than 0
## 95 percent confidence interval:
##  0.9201205       Inf
## sample estimates:
## mean in group F mean in group M 
##          13.487          11.990
# Manual computation for the paired t-test:
# - delta_f_r: row-wise difference front - rear
# - mean_f_r:  mean of these differences
# - se_f_r:    their standard deviation divided by sqrt() of the count returned by fnobs()
# - t_obs:     observed statistic, mean_f_r / se_f_r
crabs %>.%
  smutate(., delta_f_r = front - rear) %>.%
  ssummarise(.,
    mean_f_r = fmean(delta_f_r),
    se_f_r   = fsd(delta_f_r) / sqrt(fnobs(delta_f_r))) %>.%
  smutate(., t_obs = mean_f_r / se_f_r)
## # A data.trame: [1 × 3]
##   mean_f_r se_f_r t_obs
##      <dbl>  <dbl> <dbl>
## 1     2.84  0.112  25.3
# Two-sided p-value of this test: twice the right-tail area beyond the observed
# statistic (25.324, entered by hand) for a t distribution with df = 199
2 * pt(q = 25.324, df = 199, lower.tail = FALSE)
## [1] 3.667686e-64
# Scatter plot of rear width against front width; the line with slope 1 and
# intercept 0 (y = x) is drawn as a reference
chart(data = crabs, rear ~ front) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0)

# Paired Student's t-test between front and rear (two-sided, 95% confidence level)
t.test(crabs$front, crabs$rear,
  alternative = "two.sided", conf.level = 0.95, paired = TRUE)
## 
##  Paired t-test
## 
## data:  crabs$front and crabs$rear
## t = 25.324, df = 199, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  2.623004 3.065996
## sample estimates:
## mean difference 
##          2.8445
# Example of an incorrectly encoded dataset: load sleep from the datasets
# package and display it as a formatted table
sleep <- read("sleep", package = "datasets")
tabularise(sleep)

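| extra | group | ID |
|------:|:-----:|---:|
|   0.7 |   1   |  1 |
|  -1.6 |   1   |  2 |
|  -0.2 |   1   |  3 |
|  -1.2 |   1   |  4 |
|  -0.1 |   1   |  5 |
|   3.4 |   1   |  6 |
|   3.7 |   1   |  7 |
|   0.8 |   1   |  8 |
|   0.0 |   1   |  9 |
|   2.0 |   1   | 10 |
|   1.9 |   2   |  1 |
|   0.8 |   2   |  2 |
|   1.1 |   2   |  3 |
|   0.1 |   2   |  4 |
|  -0.1 |   2   |  5 |
|   4.4 |   2   |  6 |
|   5.5 |   2   |  7 |
|   1.6 |   2   |  8 |
|   4.6 |   2   |  9 |
|   3.4 |   2   | 10 |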

# Restructure the table for a paired t-test: the extra values are spread into
# one column per level of group
sleep2 <- spivot_wider(sleep, names_from = group, values_from = extra)
# Give the three resulting columns explicit names
names(sleep2) <- c("id", "med1", "med2")
tabularise(sleep2)

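| id | med1 | med2 |
|---:|-----:|-----:|
|  1 |  0.7 |  1.9 |
|  2 | -1.6 |  0.8 |
|  3 | -0.2 |  1.1 |
|  4 | -1.2 |  0.1 |
|  5 | -0.1 | -0.1 |
|  6 |  3.4 |  4.4 |
|  7 |  3.7 |  5.5 |
|  8 |  0.8 |  1.6 |
|  9 |  0.0 |  4.6 |
| 10 |  2.0 |  3.4 |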

# Scatter plot of these data (med2 against med1), with the y = x line
# (slope 1, intercept 0) as a reference
chart(data = sleep2, med2 ~ med1) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0)

# Paired Student's t-test between med1 and med2 (two-sided, 95% confidence level)
t.test(sleep2$med1, sleep2$med2,
  alternative = "two.sided", conf.level = 0.95, paired = TRUE)
## 
##  Paired t-test
## 
## data:  sleep2$med1 and sleep2$med2
## t = -4.0621, df = 9, p-value = 0.002833
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -2.4598858 -0.7001142
## sample estimates:
## mean difference 
##           -1.58
# Chart of med1 alone: a single boxplot (empty string on the x side of the
# formula), a red horizontal line at 0, no x label and an explicit y label
chart(data = sleep2, med1 ~ "") +
  geom_boxplot() +
  geom_hline(yintercept = 0, col = "red") +
  xlab("") +
  ylab("Sommeil supplémentaire avec med1 [h]")

# One-sample Student's t-test on med1 against the reference mean mu = 0
# (two-sided, 95% confidence level)
t.test(sleep2$med1,
  alternative = "two.sided", mu = 0, conf.level = 0.95)
## 
##  One Sample t-test
## 
## data:  sleep2$med1
## t = 1.3257, df = 9, p-value = 0.2176
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.5297804  2.0297804
## sample estimates:
## mean of x 
##      0.75

Test de Wilcoxon

# One-sample Wilcoxon test on med1 against the reference location mu = 0
# (two-sided); R reports it as a "Wilcoxon signed rank test"
wilcox.test(sleep2$med1,
  alternative = "two.sided", mu = 0, conf.level = 0.95)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  sleep2$med1
## V = 31, p-value = 0.3433
## alternative hypothesis: true location is not equal to 0
# Independent-samples Wilcoxon-Mann-Whitney test of rear by sex, one-sided to
# the right (alternative = "greater")
wilcox.test(data = crabs, rear ~ sex,
  alternative = "greater", conf.level = 0.95)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  rear by sex
## W = 6710, p-value = 1.473e-05
## alternative hypothesis: true location shift is greater than 0

Puissance d’un test et représentation graphique

# Power computation for a one-sample, two-sided Student's t-test with n = 10,
# d = 1.3 (effect size) and a significance level of 0.05; power is the
# quantity left unspecified, so it is the one reported
pwr::pwr.t.test(n = 10, d = 1.3, sig.level = 0.05,
  type = "one.sample", alternative = "two.sided")
## 
##      One-sample t test power calculation 
## 
##               n = 10
##               d = 1.3
##       sig.level = 0.05
##           power = 0.9538774
##     alternative = two.sided
# Two ways of displaying the data behind a Student's t or Wilcoxon test.
# a: one column per sex drawn at the group mean, with a 95% confidence-interval
#    error bar
a <- chart(data = crabs, rear ~ sex) +
  stat_summary(fun = "mean", geom = "col") +
  stat_summary(
    fun.data = "mean_cl_normal",
    fun.args = list(conf.int = 0.95),
    geom = "errorbar",
    width = 0.1
  )

# b: jittered raw observations, the group mean as a point and the same 95%
#    confidence-interval error bar, drawn thicker
b <- chart(data = crabs, rear ~ sex) +
  geom_jitter(width = 0.2, alpha = 0.3) +
  stat_summary(fun = "mean", geom = "point", size = 2) +
  stat_summary(
    fun.data = "mean_cl_normal",
    fun.args = list(conf.int = 0.95),
    geom = "errorbar",
    width = 0.1,
    linewidth = 1
  )

# Assemble both charts into a single composite figure
combine_charts(list(a, b))

# Best chart for a Wilcoxon-Mann-Whitney test: boxplots of rear by sex
chart(data = crabs, rear ~ sex) +
  geom_boxplot()

# Best chart for a Student's t-test: jittered observations, the group means as
# points and 95% confidence-interval error bars
chart(data = crabs, rear ~ sex) +
  geom_jitter(width = 0.2, alpha = 0.3) +
  stat_summary(fun = "mean", geom = "point") +
  stat_summary(
    fun.data = "mean_cl_normal",
    fun.args = list(conf.int = 0.95),
    geom = "errorbar",
    width = 0.1
  )

# Four charts with different error bars (95% CI, standard deviation,
# 2 * standard deviation and standard error).
# The chart objects are named descriptively rather than a/b/c/d, because an
# object called `c` would mask base::c() in the workspace.

# Shared base: jittered observations, group mean as a point, and a fixed
# y range of 5 to 22 used by all four charts
p <- chart(data = crabs, rear ~ sex) +
  geom_jitter(alpha = 0.1, width = 0.2) +
  stat_summary(geom = "point", fun = "mean") +
  scale_y_continuous(limits = c(5, 22))

# Error bars: 95% confidence interval of the mean
chart_ci95 <- p +
  stat_summary(geom = "errorbar", width = 0.1,
    fun.data = "mean_cl_normal", fun.args = list(conf.int = 0.95))

# Error bars: mean +/- 1 standard deviation
chart_sd <- p +
  stat_summary(geom = "errorbar", width = 0.1,
    fun.data = "mean_sdl", fun.args = list(mult = 1))

# Error bars: mean +/- 2 standard deviations
chart_2sd <- p +
  stat_summary(geom = "errorbar", width = 0.1,
    fun.data = "mean_sdl", fun.args = list(mult = 2))

# Error bars: mean +/- 1 standard error
chart_se <- p +
  stat_summary(geom = "errorbar", width = 0.1,
    fun.data = "mean_se", fun.args = list(mult = 1))

# Same order as before, so the composite figure is unchanged
combine_charts(list(chart_ci95, chart_sd, chart_2sd, chart_se))