Modèle de score de propension

Méthode : Hirano-Imbens
Covariables : MEDIA_COST*_B ; MEDIA_COST*_C ; P_POP_4564 ; p_const2011_2016 ; CRP3_SCORE_STD

Méthodes : SCMM ; Moodie-Stephens, avec imputation ; Moodie-Stephens, traitement avec masse à 0
Covariables : MEDIA_COST_B ; MEDIA_COST_C ; P_POP_4564 ; p_const2011_2016 ; CRP3_SCORE_STD
Annexe B
Code R
# 1) Aggregated analysis
# a) Average the data over the 9 weeks of interest.
# Keep year 2018, weeks 8 to 16 and rows with GEO_NBF_2016 > 0, then take the
# per-RTA mean of every investment variable listed in vars_media. Because a
# single function is applied, the averaged columns keep their original names.
DATA_MEAN0 <- DATA %>%
  filter(GEO_NBF_2016 > 0, YEAR == 2018, WEEKNUM %in% 8:16) %>%
  group_by(GEO_RTA) %>%
  summarise_at(vars(one_of(vars_media)), mean) %>%
  as.data.frame()
# b) K-means clustering
# Variables used to cluster the RTAs.
K_means_vars <- c("MEDIA_COST_B", "P_marie", "P_immigr", "P_mater_fr")
# NOTE(review): Kmeans_data is not defined in the visible listing; it is
# presumably a (scaled?) copy of DATA_MEAN0 with one row per RTA -- confirm.
MEDIA_A_kmean <- kmeans(
  x = Kmeans_data[, K_means_vars],
  centers = 5,
  nstart = 50
)
MEDIA_A_kmean$size # cluster sizes (the original listing reports 136)
MEDIA_A_kmean$withinss / MEDIA_A_kmean$size
# Retain the cluster with the smallest within-cluster sum of squares per member.
k <- which.min(MEDIA_A_kmean$withinss / MEDIA_A_kmean$size)

# ** Final dataset: the 136 homogeneous RTAs of cluster k.
# Define the LOW / MIDDLE / HIGH groups of RTAs in terms of MEDIA_COST_A:
# after sorting by treatment, the first third is LOW and the last third is HIGH.
# NOTE(review): the CLUSTER column is not created in the visible listing;
# presumably DATA_MEAN0$CLUSTER holds MEDIA_A_kmean$cluster -- confirm.
ngroup <- floor(MEDIA_A_kmean$size[k] / 3)
DATA_CLUST1 <- DATA_MEAN0 %>%
  filter(CLUSTER == k) %>%
  arrange(MEDIA_COST_A) %>%
  mutate(
    Group = c(
      rep("LOW", ngroup),
      rep("MIDDLE", MEDIA_A_kmean$size[k] - 2 * ngroup),
      rep("HIGH", ngroup)
    ),
    # NOTE(review): this statement is truncated in the source listing after
    # `ifelse(Group == 'LOW', 0,`. The HIGH = 1 / MIDDLE = NA completion below
    # is a reconstruction -- confirm against the original script.
    TREATED = ifelse(Group == "LOW", 0, ifelse(Group == "HIGH", 1, NA))
  )
# c) Hirano-Imbens method
# i) Propensity-score model
# Covariates of the treatment model. In the thesis notation a star
# (MEDIA_COST*_B) marks a 9-week average; in DATA_CLUST1 the averaged columns
# keep their plain names, so the plain names are used here.
vars_drf_1 <- c(
  "MEDIA_COST_B", "MEDIA_COST_C", "P_POP_4564",
  "p_const2011_2016", "CRP3_SCORE_STD"
)
# The treatment is MEDIA_COST_A (same response as in the bootstrap section);
# it must not be one of its own covariates.
formule_drf_1 <- paste("MEDIA_COST_A ~", paste(vars_drf_1, collapse = " + "))
modele1_gamma <- glm(
  as.formula(formule_drf_1),
  family = Gamma(link = "inverse"),
  data = DATA_CLUST1
)
summary(modele1_gamma) # the original listing reports a dispersion of 0.0326

# GPS estimation
# 1. Grid of 93 treatment values: the 3% to 95% quantiles of MEDIA_COST_A.
grille1 <- quantile(DATA_CLUST1$MEDIA_COST_A, probs = seq(0.03, .95, by = 0.01))
# 2. Conditional mean: predict.glm() returns the linear predictor, which is
#    1 / mu under the inverse link.
N <- nrow(DATA_CLUST1)
L <- length(grille1)
mu_i1 <- 1 / predict.glm(modele1_gamma)
# 3. Conditional variance.
phi <- summary(modele1_gamma)$dispersion
var_est_i <- phi * (mu_i1^2)
# 4. GPS as a gamma density with mu_i = ka * teta_i and var_i = ka * teta_i^2.
ka <- 1 / phi
teta_i <- mu_i1 / ka
# GPS evaluated at the observed treatment of each RTA.
GPS <- dgamma(DATA_CLUST1$MEDIA_COST_A, scale = teta_i, shape = ka)
# ii) Outcome model: SOUM regressed on a quadratic in treatment and GPS plus
# their interaction. The coefficient order is
# (Intercept), A, A^2, GPS, GPS^2, A:GPS -- the formulas below rely on it.
modele2_gamma <- lm(
  SOUM ~ MEDIA_COST_A + I(MEDIA_COST_A^2) + GPS + I(GPS^2) +
    MEDIA_COST_A * GPS,
  data = DATA_CLUST1
)
summary(modele2_gamma)
alphas <- modele2_gamma$coefficients

# iii) Dose-response function (ADRF) and its derivative.
# GPS matrix: row i, column j holds the gamma density of RTA i evaluated at
# grid value grille1[j].
gps1_gamma_matrix <- matrix(nrow = N, ncol = L)
for (j in seq_len(L)) {
  gps1_gamma_matrix[, j] <- (1 / (gamma(ka) * (teta_i^ka))) *
    exp(-grille1[j] / teta_i) * (grille1[j]^(ka - 1))
}

# ADRF: for each grid value, average the fitted outcome over all RTAs. The
# polynomial is linear in the GPS terms, so the average over RTAs reduces to
# column means of the GPS matrix.
a <- unname(alphas)
g <- unname(grille1)
result_grille <- a[1] + a[2] * g + a[3] * g^2 +
  a[4] * colMeans(gps1_gamma_matrix) +
  a[5] * colMeans(gps1_gamma_matrix^2) +
  a[6] * g * colMeans(gps1_gamma_matrix)
hi_clust <- result_grille

# Derivative of the GPS with respect to the treatment, at the observed
# treatment of each RTA.
gps_deriv <- (1 / (gamma(ka) * (teta_i^ka))) * (
  ((-1 / teta_i) * exp(-DATA_CLUST1$MEDIA_COST_A / teta_i) *
    (DATA_CLUST1$MEDIA_COST_A^(ka - 1))) +
    ((ka - 1) * (DATA_CLUST1$MEDIA_COST_A^(ka - 2)) *
      exp(-DATA_CLUST1$MEDIA_COST_A / teta_i))
)
# Same derivative evaluated on the grid (N x L matrix).
gps_deriv_matrix <- matrix(nrow = N, ncol = L)
for (j in seq_len(L)) {
  gps_deriv_matrix[, j] <- (1 / (gamma(ka) * (teta_i^ka))) * (
    ((-1 / teta_i) * exp(-grille1[j] / teta_i) * (grille1[j]^(ka - 1))) +
      ((ka - 1) * (grille1[j]^(ka - 2)) * exp(-grille1[j] / teta_i))
  )
}

# Derivative of the ADRF: chain rule applied to the outcome model, averaged
# over RTAs.
result_hi_derivee <- a[2] + 2 * a[3] * g +
  a[4] * colMeans(gps_deriv_matrix) +
  2 * a[5] * colMeans(gps1_gamma_matrix * gps_deriv_matrix) +
  a[6] * (g * colMeans(gps_deriv_matrix) + colMeans(gps1_gamma_matrix))
hi_derivee <- result_hi_derivee
# d) Check of the balancing property of the GPS
# Split MEDIA_COST_A into 3 intervals g1 / g2 / g3 at its 33% and 66%
# quantiles.
DATA_BALANCED1 <- DATA_CLUST1 %>%
  arrange(MEDIA_COST_A) %>%
  mutate(
    GROUP = ifelse(
      MEDIA_COST_A < quantile(MEDIA_COST_A, .33), "g1",
      ifelse(MEDIA_COST_A > quantile(MEDIA_COST_A, .66), "g3", "g2")
    )
  ) %>%
  as.data.frame()

# i) Before adjusting for the GPS (unadjusted)
# 5 covariates x 3 groups = 15 t-tests, each comparing one interval against
# the other two pooled.
ttest_unadjusted <- lapply(c("g1", "g2", "g3"), function(grp_label) {
  DATA_BALANCED1 %>%
    summarise_at(
      vars(one_of(vars_drf_1)),
      list(~ t.test(.[GROUP == grp_label], .[GROUP != grp_label])$statistic)
    ) %>%
    mutate(GROUP = paste0(grp_label, "_MEDIA_A"))
}) %>%
  bind_rows() %>%
  # Reshape to one row per covariate and one column per interval.
  gather(-GROUP, key = Xi, value = ttest) %>%
  spread(key = GROUP, value = ttest) %>%
  mutate_at(vars(-Xi), list(~ round(., 2))) %>%
  as_tibble()
# Table of t statistics for equality of means before GPS adjustment.
ttest_unadjusted
# ii) After adjusting for the GPS (adjusted)
# a) Median treatment within each interval g1 / g2 / g3.
MEDIA_A_median <- DATA_BALANCED1 %>%
  group_by(GROUP) %>%
  summarise(median = median(MEDIA_COST_A)) %>%
  .$median

# b) GPS of every RTA evaluated at each interval's median treatment (gamma
# density with shape ka and per-RTA scale teta_i).
# NOTE(review): teta_i is taken from the workspace and is assumed to be in the
# same row order as DATA_BALANCED1 -- confirm.
DATA_BALANCED1 <- DATA_BALANCED1 %>%
  mutate(
    gps_median1 = dgamma(MEDIA_A_median[1], shape = ka, scale = teta_i),
    gps_median2 = dgamma(MEDIA_A_median[2], shape = ka, scale = teta_i),
    gps_median3 = dgamma(MEDIA_A_median[3], shape = ka, scale = teta_i)
  )

# c) Cut each of these GPS vectors into 4 equal-width blocks.
library(forcats)
nblocks <- 4
DATA_BALANCED1 <- DATA_BALANCED1 %>%
  mutate(
    gps_block_from_m1 = cut(gps_median1, nblocks) %>%
      lvls_revalue(paste0("block_", 1:nblocks)),
    gps_block_from_m2 = cut(gps_median2, nblocks) %>%
      lvls_revalue(paste0("block_", 1:nblocks)),
    gps_block_from_m3 = cut(gps_median3, nblocks) %>%
      lvls_revalue(paste0("block_", 1:nblocks))
  )

# d) Within every GPS block, t-test of one interval against the others, then
# average the block-level statistics weighted by block size.
adjusted_tstat <- function(grp_label, block_col) {
  block_sizes <- DATA_BALANCED1 %>%
    group_by_at(block_col) %>%
    summarise(n = n())
  DATA_BALANCED1 %>%
    group_by_at(block_col) %>%
    summarise_at(
      vars(one_of(vars_drf_1)),
      list(~ t.test(.[GROUP == grp_label], .[GROUP != grp_label])$statistic)
    ) %>%
    ungroup() %>%
    left_join(block_sizes, by = block_col) %>%
    summarise_at(vars(one_of(vars_drf_1)), list(~ sum(. * n) / sum(n))) %>%
    mutate(GROUP = paste0(grp_label, "_MEDIA_A"))
}
ttest_adjusted <- bind_rows(
  adjusted_tstat("g1", "gps_block_from_m1"),
  adjusted_tstat("g2", "gps_block_from_m2"),
  adjusted_tstat("g3", "gps_block_from_m3")
) %>%
  gather(-GROUP, key = Xi, value = ttest) %>%
  spread(key = GROUP, value = ttest) %>%
  mutate_at(vars(-Xi), list(~ round(., 2)))
# Table of t statistics for equality of means after GPS adjustment.
ttest_adjusted
## --- Bootstrap (Hirano-Imbens) ---
# Each iteration draws 120 RTAs, refits the treatment and outcome models on
# that subsample, and stores the ADRF and its derivative on the subsample's
# own quantile grid.
vars_drf_1 <- c(
  "MEDIA_COST_B", "MEDIA_COST_C", "P_POP_4564",
  "p_const2011_2016", "CRP3_SCORE_STD"
)
formule_drf_1 <- paste("MEDIA_COST_A ~", paste(vars_drf_1, collapse = " + "))
# RTAs available for resampling (the original listing used `rtas` here before
# defining it).
rtas <- unique(DATA_CLUST1$GEO_RTA)
iter <- 1000
result_boot_hi <- vector("list", iter)
result_boot_hi_deriv <- vector("list", iter)
grille_hi <- vector("list", iter)
for (i in seq_len(iter)) {
  print(i)
  # NOTE(review): sample() draws without replacement, so this is a subsample
  # of 120 RTAs rather than a classical bootstrap resample -- confirm intent.
  rta_sample <- sample(rtas, size = 120)
  DATA_SAMPLE <- DATA_CLUST1 %>% filter(GEO_RTA %in% rta_sample)
  modele1_gps <- glm(
    as.formula(formule_drf_1),
    family = Gamma(link = "inverse"),
    data = DATA_SAMPLE
  )
  # GPS estimation on the subsample.
  grille1_sample <- quantile(
    DATA_SAMPLE$MEDIA_COST_A,
    probs = seq(0.03, .95, by = 0.01)
  )
  L <- length(grille1_sample)
  N <- nrow(DATA_SAMPLE)
  # Conditional mean (inverse link) and gamma shape / scale parameters.
  mu_i <- 1 / predict.glm(modele1_gps, newdata = DATA_SAMPLE)
  phi <- summary(modele1_gps)$dispersion
  ka <- 1 / phi
  teta_i <- mu_i / ka
  gps_sample <- dgamma(DATA_SAMPLE$MEDIA_COST_A, scale = teta_i, shape = ka)

  # GPS and GPS derivative on the grid (N x L matrices).
  gps1_gamma_matrix <- matrix(nrow = N, ncol = L)
  gps_deriv_matrix <- matrix(nrow = N, ncol = L)
  for (j in seq_len(L)) {
    gps1_gamma_matrix[, j] <- (1 / (gamma(ka) * (teta_i^ka))) *
      exp(-grille1_sample[j] / teta_i) * (grille1_sample[j]^(ka - 1))
    gps_deriv_matrix[, j] <- (1 / (gamma(ka) * (teta_i^ka))) * (
      ((-1 / teta_i) * exp(-grille1_sample[j] / teta_i) *
        (grille1_sample[j]^(ka - 1))) +
        ((ka - 1) * (grille1_sample[j]^(ka - 2)) *
          exp(-grille1_sample[j] / teta_i))
    )
  }

  # Outcome model; coefficient order (Intercept), A, A^2, GPS, GPS^2, A:GPS.
  DATA_SAMPLE <- DATA_SAMPLE %>% mutate(GPS = gps_sample)
  modele2_gamma <- lm(
    SOUM ~ MEDIA_COST_A + I(MEDIA_COST_A^2) + GPS + I(GPS^2) +
      MEDIA_COST_A * GPS,
    data = DATA_SAMPLE
  )
  alphas <- coef(modele2_gamma)
  a <- unname(alphas)
  g <- unname(grille1_sample)

  # ADRF: fitted outcome averaged over the sampled RTAs at each grid value.
  result_sample <- a[1] + a[2] * g + a[3] * g^2 +
    a[4] * colMeans(gps1_gamma_matrix) +
    a[5] * colMeans(gps1_gamma_matrix^2) +
    a[6] * g * colMeans(gps1_gamma_matrix)
  result_boot_hi[[i]] <- result_sample
  grille_hi[[i]] <- grille1_sample

  # Derivative of the ADRF.
  result_sample_deriv <- a[2] + 2 * a[3] * g +
    a[4] * colMeans(gps_deriv_matrix) +
    2 * a[5] * colMeans(gps1_gamma_matrix * gps_deriv_matrix) +
    a[6] * (g * colMeans(gps_deriv_matrix) + colMeans(gps1_gamma_matrix))
  result_boot_hi_deriv[[i]] <- result_sample_deriv
}
# 2) Longitudinal analysis
## Keep the weekly values of the 9 weeks (8 to 16) of year 2018.
DATA_LONG2018 <- DATA %>%
  filter(GEO_NBF_2016 > 0, YEAR == 2018, WEEKNUM %in% 8:16) %>%
  # URBAIN is 0 when the RTA code contains the character "0", 1 otherwise.
  mutate(URBAIN = ifelse(grepl("0", GEO_RTA), 0, 1)) %>%
  # Express every vars_media variable per unit of GEO_NBF_2016.
  mutate_at(vars(one_of(vars_media)), list(~ . / GEO_NBF_2016)) %>%
  select(
    YEAR, WEEKNUM, GEO_RTA, GEO_POP, GEO_NBF_2016, URBAIN,
    one_of(vars_media)
  )
# Merge with the STATCAN table and drop incomplete rows.
DATA_LONG2018 <- DATA_LONG2018 %>%
  left_join(STATCAN, by = c("GEO_RTA", "GEO_NBF_2016", "URBAIN")) %>%
  na.omit() %>%
  as.data.frame()

# ** Cluster of 1224 rows: 136 RTAs repeated over 9 weeks.
LONG <- filter(DATA_MEAN0, CLUSTER == k) %>% select(GEO_RTA)
DATA_CLUST1_LONG2018 <- DATA_LONG2018 %>% filter(GEO_RTA %in% LONG$GEO_RTA)

# ** Add the lagged variables (previous week's outcome and treatment within
# each RTA). The first retained week has no predecessor, so its lag is 0.
DATA_CLUST1_LONG2018 <- DATA_CLUST1_LONG2018 %>%
  arrange(GEO_RTA, WEEKNUM) %>%
  group_by(GEO_RTA) %>%
  mutate(
    SOUM_lag1 = dplyr::lag(SOUM, n = 1, default = 0),
    MEDIA_lag1_A = dplyr::lag(MEDIA_COST_A, n = 1, default = 0)
  ) %>%
  ungroup()
# WEEKNUM as a factor.
DATA_CLUST1_LONG2018 <- DATA_CLUST1_LONG2018 %>%
  mutate(WEEKNUM_fac = as.factor(WEEKNUM))
# 2.1) SCMM approach
# Imputation: zero treatments are replaced by 0.1.
DATA_CLUST_LONG2018 <- DATA_CLUST1_LONG2018 %>%
  mutate(MEDIA_COST_A = ifelse(MEDIA_COST_A == 0, 0.1, MEDIA_COST_A)) %>%
  mutate(WEEKNUM_fac = as.factor(WEEKNUM))

# i) Propensity-score model: linear regression of the treatment on the
# covariates, the week factor and the lagged treatment and outcome.
vars1_SCMM_fac <- c(
  "MEDIA_COST_B", "MEDIA_COST_C", "P_POP_4564",
  "p_const2011_2016", "CRP3_SCORE_STD", "WEEKNUM_fac",
  "MEDIA_lag1_A", "SOUM_lag1"
)
formule_gee_fac <- paste(
  "MEDIA_COST_A ~",
  paste(vars1_SCMM_fac, collapse = " + ")
)
mod1 <- lm(as.formula(formule_gee_fac), data = DATA_CLUST_LONG2018)
summary(mod1)
# The propensity score is the fitted treatment value.
gps_fac <- mod1$fitted.values

# ii) Outcome model for SOUM. gps_fac is not a column of the data frame; lm()
# picks it up from the workspace through the formula's environment.
vars2_SCMM <- c(
  "MEDIA_COST_A", "MEDIA_lag1_A", "SOUM_lag1", "gps_fac",
  "MEDIA_COST_B", "MEDIA_COST_C", "P_POP_4564", "p_const2011_2016",
  "CRP3_SCORE_STD"
)
formule <- paste("SOUM ~", paste(vars2_SCMM, collapse = " + "))
mod2 <- lm(as.formula(formule), data = DATA_CLUST_LONG2018)
# a) Moodie-Stephens, with imputation
# i) Propensity-score model (gamma GLM, inverse link) on the imputed data.
vars_drf_LONG_mgps <- c(
  "MEDIA_COST_B", "MEDIA_COST_C", "P_POP_4564",
  "p_const2011_2016", "CRP3_SCORE_STD", "WEEKNUM_fac",
  "MEDIA_lag1_A", "SOUM_lag1"
)
formule_mgps <- paste(
  "MEDIA_COST_A ~",
  paste(vars_drf_LONG_mgps, collapse = " + ")
)
modele_gamma_mgps <- glm(
  as.formula(formule_mgps),
  family = Gamma(link = "inverse"),
  data = DATA_CLUST_LONG2018
)
summary(modele_gamma_mgps)

# MGPS estimation
# 1. Grid of 93 treatment values: the 3% to 95% quantiles of MEDIA_COST_A.
grille1_LONG <- quantile(
  DATA_CLUST_LONG2018$MEDIA_COST_A,
  probs = seq(0.03, .95, by = 0.01)
)
# 2. Conditional mean (predict.glm() returns 1 / mu under the inverse link).
L <- length(grille1_LONG)
N <- nrow(DATA_CLUST_LONG2018)
mu_i1 <- 1 / predict.glm(modele_gamma_mgps)
# 3. Conditional variance.
phi1 <- summary(modele_gamma_mgps)$dispersion
var_est_i <- phi1 * (mu_i1^2)
# 4. Gamma density parameters: mu_i = ka * teta_i, var_i = ka * teta_i^2.
ka <- 1 / phi1
teta_i1 <- mu_i1 / ka
# 5. MGPS at the observed treatment, stored as a column for the outcome model.
DATA_CLUST_LONG2018 <- DATA_CLUST_LONG2018 %>%
  mutate(mgps1 = dgamma(MEDIA_COST_A, scale = teta_i1, shape = ka))
# MGPS on the grid (N x L).
mgps1_matrix <- matrix(nrow = N, ncol = L)
for (j in seq_len(L)) {
  mgps1_matrix[, j] <- (1 / (gamma(ka) * (teta_i1^ka))) *
    exp(-grille1_LONG[j] / teta_i1) * (grille1_LONG[j]^(ka - 1))
}
# 6. Derivative of the MGPS at the observed treatment ...
mgps1_deriv <- (1 / (gamma(ka) * (teta_i1^ka))) * (
  ((-1 / teta_i1) * exp(-DATA_CLUST_LONG2018$MEDIA_COST_A / teta_i1) *
    (DATA_CLUST_LONG2018$MEDIA_COST_A^(ka - 1))) +
    ((ka - 1) * (DATA_CLUST_LONG2018$MEDIA_COST_A^(ka - 2)) *
      exp(-DATA_CLUST_LONG2018$MEDIA_COST_A / teta_i1))
)
# ... and on the grid. The loop header was missing in the original listing, so
# only one column would have been filled.
mgps1_deriv_matrix <- matrix(nrow = N, ncol = L)
for (j in seq_len(L)) {
  mgps1_deriv_matrix[, j] <- (1 / (gamma(ka) * (teta_i1^ka))) * (
    ((-1 / teta_i1) * exp(-grille1_LONG[j] / teta_i1) *
      (grille1_LONG[j]^(ka - 1))) +
      ((ka - 1) * (grille1_LONG[j]^(ka - 2)) *
        exp(-grille1_LONG[j] / teta_i1))
  )
}

# ii) Outcome model for SOUM; coefficient order
# (Intercept), A, A^2, mgps1, A:mgps1.
mod_mgps <- lm(
  SOUM ~ MEDIA_COST_A + I(MEDIA_COST_A^2) + mgps1 + MEDIA_COST_A * mgps1,
  data = DATA_CLUST_LONG2018
)
summary(mod_mgps)

# ADRF: fitted outcome averaged over all rows at each grid value.
alphas <- coef(mod_mgps)
a <- unname(alphas)
g <- unname(grille1_LONG)
result_grille_mgps <- a[1] + a[2] * g + a[3] * g^2 +
  a[4] * colMeans(mgps1_matrix) +
  a[5] * g * colMeans(mgps1_matrix)
result_mgps1 <- result_grille_mgps

# Derivative of the ADRF.
result_grille_deriv <- a[2] + 2 * a[3] * g +
  a[4] * colMeans(mgps1_deriv_matrix) +
  a[5] * (g * colMeans(mgps1_deriv_matrix) + colMeans(mgps1_matrix))
deriv1 <- result_grille_deriv
## --- Bootstrap (Moodie-Stephens, with imputation) ---
# Each iteration draws 120 RTAs (all their weeks), refits both models on that
# subsample and stores the ADRF and its derivative on the subsample's grid.
vars_drf_LONG_mgps <- c(
  "MEDIA_COST_B", "MEDIA_COST_C", "P_POP_4564", "p_const2011_2016",
  "CRP3_SCORE_STD", "WEEKNUM_fac", "SOUM_lag1", "MEDIA_lag1_A"
)
formule1_mgps <- paste(
  "MEDIA_COST_A ~",
  paste(vars_drf_LONG_mgps, collapse = " + ")
)
rtas <- unique(DATA_CLUST_LONG2018$GEO_RTA)
iter <- 1000
result_boot_ms <- vector("list", iter)
result_boot_ms_deriv <- vector("list", iter)
grille_ms <- vector("list", iter)
for (i in seq_len(iter)) {
  print(i)
  # NOTE(review): sample() draws without replacement (subsample of 120 RTAs)
  # -- confirm intent.
  rta_sample <- sample(rtas, size = 120)
  DATA_SAMPLE <- DATA_CLUST_LONG2018 %>% filter(GEO_RTA %in% rta_sample)
  # Refit the treatment model on the subsample. The original listing fitted
  # it on the full data, so the propensity model never varied across
  # iterations.
  modele_gamma_mgps <- glm(
    as.formula(formule1_mgps),
    family = Gamma(link = "inverse"),
    data = DATA_SAMPLE
  )
  # MGPS estimation.
  grille1_LONG_sample <- quantile(
    DATA_SAMPLE$MEDIA_COST_A,
    probs = seq(0.03, .95, by = 0.01)
  )
  grille_ms[[i]] <- grille1_LONG_sample
  L <- length(grille1_LONG_sample)
  N <- nrow(DATA_SAMPLE)
  # Conditional mean, dispersion and gamma shape / scale parameters.
  mu_i1 <- 1 / predict.glm(modele_gamma_mgps, newdata = DATA_SAMPLE)
  phi1 <- summary(modele_gamma_mgps)$dispersion
  var_est_i <- phi1 * (mu_i1^2)
  ka <- 1 / phi1
  teta_i1 <- mu_i1 / ka
  mgps1_SAMPLE <- dgamma(DATA_SAMPLE$MEDIA_COST_A, scale = teta_i1, shape = ka)
  DATA_SAMPLE <- DATA_SAMPLE %>% mutate(mgps1 = mgps1_SAMPLE)

  # MGPS and its derivative on the grid (N x L). The derivative was outside
  # any loop in the original listing.
  mgps1_matrix <- matrix(nrow = N, ncol = L)
  mgps1_deriv_matrix <- matrix(nrow = N, ncol = L)
  for (j in seq_len(L)) {
    mgps1_matrix[, j] <- (1 / (gamma(ka) * (teta_i1^ka))) *
      exp(-grille1_LONG_sample[j] / teta_i1) *
      (grille1_LONG_sample[j]^(ka - 1))
    mgps1_deriv_matrix[, j] <- (1 / (gamma(ka) * (teta_i1^ka))) * (
      ((-1 / teta_i1) * exp(-grille1_LONG_sample[j] / teta_i1) *
        (grille1_LONG_sample[j]^(ka - 1))) +
        ((ka - 1) * (grille1_LONG_sample[j]^(ka - 2)) *
          exp(-grille1_LONG_sample[j] / teta_i1))
    )
  }
  # Derivative of the MGPS at the observed treatment.
  mgps1_deriv <- (1 / (gamma(ka) * (teta_i1^ka))) * (
    ((-1 / teta_i1) * exp(-DATA_SAMPLE$MEDIA_COST_A / teta_i1) *
      (DATA_SAMPLE$MEDIA_COST_A^(ka - 1))) +
      ((ka - 1) * (DATA_SAMPLE$MEDIA_COST_A^(ka - 2)) *
        exp(-DATA_SAMPLE$MEDIA_COST_A / teta_i1))
  )

  # Outcome model; coefficient order (Intercept), A, A^2, mgps1, A:mgps1.
  mod1_mgps <- lm(
    SOUM ~ MEDIA_COST_A + I(MEDIA_COST_A^2) + mgps1 + MEDIA_COST_A * mgps1,
    data = DATA_SAMPLE
  )
  alphas <- coef(mod1_mgps)
  a <- unname(alphas)
  g <- unname(grille1_LONG_sample)

  # ADRF.
  result_sample1 <- a[1] + a[2] * g + a[3] * g^2 +
    a[4] * colMeans(mgps1_matrix) +
    a[5] * g * colMeans(mgps1_matrix)
  result_boot_ms[[i]] <- result_sample1

  # Derivative of the ADRF.
  result_sample_deriv_ms <- a[2] + 2 * a[3] * g +
    a[4] * colMeans(mgps1_deriv_matrix) +
    a[5] * (g * colMeans(mgps1_deriv_matrix) + colMeans(mgps1_matrix))
  result_boot_ms_deriv[[i]] <- result_sample_deriv_ms
}
# b) Moodie-Stephens, treatment with a point mass at 0
# i) MGPS for the non-zero treatment: gamma GLM fitted on rows with
# MEDIA_COST_A != 0 only.
DATA_CLUST1_LONG2018_0 <- DATA_CLUST1_LONG2018 %>%
  filter(MEDIA_COST_A != 0) %>%
  mutate(WEEKNUM_fac = as.factor(WEEKNUM))
formule1_mgps <- paste(
  "MEDIA_COST_A ~",
  paste(vars_drf_LONG_mgps, collapse = " + ")
)
modele1_mgps <- glm(
  as.formula(formule1_mgps),
  family = Gamma(link = "inverse"),
  data = DATA_CLUST1_LONG2018_0
)
summary(modele1_mgps)
# Quantile grid of the non-zero treatments.
grille1_LONG_0 <- quantile(
  DATA_CLUST1_LONG2018_0$MEDIA_COST_A,
  probs = seq(0.03, .95, by = 0.01)
)
# Conditional mean predicted for ALL rows (zeros included), then gamma
# shape / scale parameters.
mu_i_0 <- 1 / predict.glm(modele1_mgps, newdata = DATA_CLUST1_LONG2018)
phi0 <- summary(modele1_mgps)$dispersion
var_est_i <- phi0 * (mu_i_0^2)
ka <- 1 / phi0
teta_i0 <- mu_i_0 / ka
mgps_0 <- dgamma(DATA_CLUST1_LONG2018$MEDIA_COST_A, scale = teta_i0, shape = ka)

# ii) Mixture model: logistic regression for the mass at 0.
# newvar is 1 when MEDIA_COST_A == 0 and 0 otherwise.
DATA_CLUST1_LONG2018 <- DATA_CLUST1_LONG2018 %>%
  mutate(
    newvar = ifelse(MEDIA_COST_A == 0, 1, 0),
    WEEKNUM_fac = as.factor(WEEKNUM)
  )
formule1_mgps <- paste("newvar ~", paste(vars_drf_LONG_mgps, collapse = " + "))
# Model for the probability that newvar == 1, i.e. MEDIA_COST_A == 0.
model_mgps <- glm(
  data = DATA_CLUST1_LONG2018,
  formula = as.formula(formule1_mgps),
  family = binomial(link = "logit")
)
summary(model_mgps)

# r_hat: pi_hat is the predicted probability of a zero treatment; WEIGHT is
# the probability of the observed regime (zero / non-zero). This mutate() was
# truncated in the original listing and WEIGHT was never defined; it is
# restored from the matching statement in the bootstrap section.
DATA_CLUST1_LONG2018 <- DATA_CLUST1_LONG2018 %>%
  mutate(
    pi_hat = round(predict(model_mgps, type = "response"), 10),
    WEIGHT = ifelse(newvar == 1, pi_hat, 1 - pi_hat)
  )
DATA_CLUST1_LONG2018 <- DATA_CLUST1_LONG2018 %>%
  mutate(
    mgps_0 = mgps_0,
    r_hat = ifelse(newvar == 1, WEIGHT, WEIGHT * mgps_0)
  )

# MGPS on the grid.
grille1_mass0 <- quantile(
  DATA_CLUST1_LONG2018$MEDIA_COST_A,
  probs = seq(0.03, .95, by = 0.01)
)
L <- length(grille1_mass0)
N <- nrow(DATA_CLUST1_LONG2018)
# NOTE(review): column 1 is filled with pi_hat and columns 2..L with the
# continuous part, which assumes grille1_mass0[1] == 0 and every other grid
# value is > 0 -- confirm against the share of zero treatments.
mgps0_matrix <- matrix(nrow = N, ncol = L)
mgps0_matrix[, 1] <- DATA_CLUST1_LONG2018$pi_hat
for (j in seq_len(L)[-1]) {
  mgps0_matrix[, j] <- (1 - DATA_CLUST1_LONG2018$pi_hat) *
    (1 / (gamma(ka) * (teta_i0^ka))) *
    exp(-grille1_mass0[j] / teta_i0) * (grille1_mass0[j]^(ka - 1))
}
# Derivative of r_hat with respect to the treatment (0 at the point mass).
mgps0_deriv_matrix <- matrix(nrow = N, ncol = L)
mgps0_deriv_matrix[, 1] <- 0
for (j in seq_len(L)[-1]) {
  mgps0_deriv_matrix[, j] <- (1 - DATA_CLUST1_LONG2018$pi_hat) *
    (1 / (gamma(ka) * (teta_i0^ka))) * (
      ((-1 / teta_i0) * exp(-grille1_mass0[j] / teta_i0) *
        (grille1_mass0[j]^(ka - 1))) +
        ((ka - 1) * (grille1_mass0[j]^(ka - 2)) *
          exp(-grille1_mass0[j] / teta_i0))
    )
}

# iii) Outcome model for SOUM; coefficient order
# (Intercept), A, A^2, r_hat, A:r_hat.
modele2_mgps <- lm(
  SOUM ~ MEDIA_COST_A + I(MEDIA_COST_A^2) + r_hat + MEDIA_COST_A * r_hat,
  data = DATA_CLUST1_LONG2018
)
summary(modele2_mgps)

# ADRF: fitted outcome averaged over all rows at each grid value.
alphas <- coef(modele2_mgps)
a <- unname(alphas)
g <- unname(grille1_mass0)
result_mass0 <- a[1] + a[2] * g + a[3] * g^2 +
  a[4] * colMeans(mgps0_matrix) +
  a[5] * g * colMeans(mgps0_matrix)
result_mgps0 <- result_mass0

# Derivative of the ADRF.
result_grille_deriv <- a[2] + 2 * a[3] * g +
  a[4] * colMeans(mgps0_deriv_matrix) +
  a[5] * (g * colMeans(mgps0_deriv_matrix) + colMeans(mgps0_matrix))
deriv0 <- result_grille_deriv
## --- Bootstrap (Moodie-Stephens, treatment with a point mass at 0) ---
# Each iteration draws 120 RTAs, refits the gamma model (non-zero treatments),
# the logistic model (mass at 0) and the outcome model on that subsample, and
# stores the ADRF and its derivative on the subsample's grid.
vars_drf_LONG_mgps <- c(
  "MEDIA_COST_B", "MEDIA_COST_C", "P_POP_4564", "p_const2011_2016",
  "CRP3_SCORE_STD", "WEEKNUM_fac", "SOUM_lag1", "MEDIA_lag1_A"
)
formule1_mgps <- paste(
  "MEDIA_COST_A ~",
  paste(vars_drf_LONG_mgps, collapse = " + ")
)
formule1_log <- paste("newvar ~", paste(vars_drf_LONG_mgps, collapse = " + "))
rtas <- unique(DATA_CLUST1_LONG2018$GEO_RTA)
iter <- 1000
result_boot <- vector("list", iter)
result_boot_deriv <- vector("list", iter)
grille <- vector("list", iter)
for (i in seq_len(iter)) {
  print(i)
  # NOTE(review): sample() draws without replacement (subsample of 120 RTAs)
  # -- confirm intent.
  rta_sample <- sample(rtas, size = 120)
  DATA_SAMPLE <- DATA_CLUST1_LONG2018 %>% filter(GEO_RTA %in% rta_sample)
  DATA_SAMPLE_0 <- DATA_SAMPLE %>%
    filter(MEDIA_COST_A != 0) %>%
    mutate(WEEKNUM_fac = as.factor(WEEKNUM))
  # Gamma model for the non-zero treatments.
  modele1_mgps <- glm(
    as.formula(formule1_mgps),
    family = Gamma(link = "inverse"),
    data = DATA_SAMPLE_0
  )
  grille1_LONG_sample0 <- quantile(
    DATA_SAMPLE_0$MEDIA_COST_A,
    probs = seq(0.03, .95, by = 0.01)
  )
  # Conditional mean predicted for every sampled row (zeros included), then
  # gamma shape / scale parameters.
  mu_i_0 <- 1 / predict.glm(modele1_mgps, newdata = DATA_SAMPLE)
  phi0 <- summary(modele1_mgps)$dispersion
  var_est_i <- phi0 * (mu_i_0^2)
  ka <- 1 / phi0
  teta_i0 <- mu_i_0 / ka
  mgps_sample0 <- dgamma(DATA_SAMPLE$MEDIA_COST_A, scale = teta_i0, shape = ka)

  # newvar is 1 when MEDIA_COST_A == 0; logistic model for P(newvar == 1).
  DATA_SAMPLE <- DATA_SAMPLE %>%
    mutate(newvar = ifelse(MEDIA_COST_A == 0, 1, 0))
  model_mgps <- glm(
    data = DATA_SAMPLE,
    formula = as.formula(formule1_log),
    family = binomial(link = "logit")
  )
  # pi_hat: predicted probability of a zero treatment; WEIGHT: probability of
  # the observed regime.
  DATA_SAMPLE <- DATA_SAMPLE %>%
    mutate(
      pi_hat = round(predict(model_mgps, type = "response"), 10),
      WEIGHT = ifelse(newvar == 1, pi_hat, 1 - pi_hat)
    )
  # r_hat uses the density computed on THIS subsample. The original listing
  # used mgps_0 from the full data, whose length does not match DATA_SAMPLE.
  DATA_SAMPLE <- DATA_SAMPLE %>%
    mutate(
      mgps_0 = mgps_sample0,
      r_hat = ifelse(newvar == 1, WEIGHT, WEIGHT * mgps_0)
    )

  grille1_sample0 <- quantile(
    DATA_SAMPLE$MEDIA_COST_A,
    probs = seq(0.03, .95, by = 0.01)
  )
  grille[[i]] <- grille1_sample0
  L <- length(grille1_sample0)
  N <- nrow(DATA_SAMPLE)
  # NOTE(review): column 1 is treated as the point mass at 0, which assumes
  # grille1_sample0[1] == 0 and all other grid values are > 0 -- confirm.
  mgps0_matrix <- matrix(nrow = N, ncol = L)
  mgps0_matrix[, 1] <- DATA_SAMPLE$pi_hat
  # Derivative of r_hat (0 at the point mass). The loop header was missing in
  # the original listing.
  mgps0_deriv_matrix <- matrix(nrow = N, ncol = L)
  mgps0_deriv_matrix[, 1] <- 0
  for (j in seq_len(L)[-1]) {
    mgps0_matrix[, j] <- (1 - DATA_SAMPLE$pi_hat) *
      (1 / (gamma(ka) * (teta_i0^ka))) *
      exp(-grille1_sample0[j] / teta_i0) * (grille1_sample0[j]^(ka - 1))
    mgps0_deriv_matrix[, j] <- (1 - DATA_SAMPLE$pi_hat) *
      (1 / (gamma(ka) * (teta_i0^ka))) * (
        ((-1 / teta_i0) * exp(-grille1_sample0[j] / teta_i0) *
          (grille1_sample0[j]^(ka - 1))) +
          ((ka - 1) * (grille1_sample0[j]^(ka - 2)) *
            exp(-grille1_sample0[j] / teta_i0))
      )
  }

  # Outcome model; coefficient order (Intercept), A, A^2, r_hat, A:r_hat.
  # The original listing read the coefficients from an undefined object
  # (fit2_mgps).
  modele2_mgps <- lm(
    SOUM ~ MEDIA_COST_A + I(MEDIA_COST_A^2) + r_hat + MEDIA_COST_A * r_hat,
    data = DATA_SAMPLE
  )
  alphas <- coef(modele2_mgps)
  a <- unname(alphas)
  g <- unname(grille1_sample0)

  # ADRF.
  result_sample0 <- a[1] + a[2] * g + a[3] * g^2 +
    a[4] * colMeans(mgps0_matrix) +
    a[5] * g * colMeans(mgps0_matrix)
  result_boot[[i]] <- result_sample0

  # Derivative of the ADRF.
  result_sample_deriv <- a[2] + 2 * a[3] * g +
    a[4] * colMeans(mgps0_deriv_matrix) +
    a[5] * (g * colMeans(mgps0_deriv_matrix) + colMeans(mgps0_matrix))
  result_boot_deriv[[i]] <- result_sample_deriv
}
# Summarise a list of bootstrap curves (one numeric vector of length L per
# iteration): pointwise mean (SOUM) and 5% / 95% pointwise quantiles
# (ic1 / ic2), paired with the treatment grid used as x axis.
summarise_boot <- function(boot_curves, grid) {
  curves <- do.call(cbind, boot_curves) # L x iter
  data.frame(
    SOUM = rowMeans(curves),
    ic1 = apply(curves, 1, function(x) quantile(x, probs = 0.05)),
    ic2 = apply(curves, 1, function(x) quantile(x, probs = 0.95)),
    MEDIA_COST_A = unname(grid)
  )
}

# Overlay the three methods: ribbon = 5%-95% band, points = bootstrap mean.
plot_methods <- function(m1, m2, m3) {
  ggplot(data = m1, aes(x = MEDIA_COST_A)) +
    geom_ribbon(
      aes(ymin = ic1, ymax = ic2, fill = "Hirano-Imbens"),
      alpha = 0.7
    ) +
    geom_point(aes(y = SOUM, color = "Hirano-Imbens")) +
    geom_ribbon(
      data = m2,
      aes(ymin = ic1, ymax = ic2, fill = "Moodies-Stephens, avec imputation"),
      alpha = 0.7
    ) +
    geom_point(
      data = m2,
      aes(y = SOUM, color = "Moodies-Stephens, avec imputation")
    ) +
    geom_ribbon(
      data = m3,
      aes(
        ymin = ic1, ymax = ic2,
        fill = "Moodie-Stephens, traitement avec masse a 0"
      ),
      alpha = 0.7
    ) +
    geom_point(
      data = m3,
      aes(y = SOUM, color = "Moodie-Stephens, traitement avec masse a 0")
    )
}

## Dose-response curves of figure 4.1
# The x axis is the grid of the FIRST bootstrap iteration of each method.
# 1) Hirano-Imbens
method1 <- summarise_boot(result_boot_hi, grille_hi[[1]])
# 2) Moodie-Stephens, with imputation
method2 <- summarise_boot(result_boot_ms, grille_ms[[1]])
# 3) Moodie-Stephens, treatment with a point mass at 0
method3 <- summarise_boot(result_boot, grille[[1]])
plot_methods(method1, method2, method3)

## Derivative curves of figure 4.2
# method1..3 are overwritten; their SOUM column now holds the mean derivative.
# 1) Hirano-Imbens
method1 <- summarise_boot(result_boot_hi_deriv, grille_hi[[1]])
# 2) Moodie-Stephens, with imputation
method2 <- summarise_boot(result_boot_ms_deriv, grille_ms[[1]])
# 3) Moodie-Stephens, treatment with a point mass at 0
method3 <- summarise_boot(result_boot_deriv, grille[[1]])
plot_methods(method1, method2, method3) +
  # Horizontal line for the SCMM method.
  # NOTE(review): 0.015 is hard-coded; presumably the MEDIA_COST_A coefficient
  # of the SCMM outcome model -- confirm.
  geom_line(
    data = data.frame(x = grille1_LONG, y = 0.015),
    aes(x, y, color = "SCMM")
  )
Bibliographie
Lecocq Aurélie, Ammi Mehdi, and Bellarbre Élodie. Le score de propension : Un guide méthodologique pour les recherches expérimentales et quasi expérimentales en éducation. Mesure et Évaluation en Éducation, 37(2) :69–100, 2014.
Peter C Austin. An introduction to propensity score methods for reducing the effects of confounding in observational studies. Multivariate Behavioral Research, 46(3) :399–424, 2011.
Peter C Austin. Assessing covariate balance when using the generalized propensity score with quantitative or continuous exposures. Statistical Methods in Medical Research, 28(5) : 1365–1377, 2019.
Michela Bia and Alessandra Mattei. A stata package for the estimation of the dose-response function through adjustment for the generalized propensity score. The Stata Journal, 8(3) : 354–373, 2008.
Alex Bryson, Richard Dorsett, Susan Purdon, et al. The use of propensity score matching in the evaluation of active labour market policies. Technical report, London School of Economics and Political Science, LSE Library, 2002.
Rajeev Dehejia. Practical propensity score matching : a reply to Smith and Todd. Journal of Econometrics, 125(1-2) :355–364, 2005.
Rajeev H Dehejia and Sadek Wahba. Causal effects in nonexperimental studies : Reevaluating the evaluation of training programs. Journal of the American Statistical Association, 94 (448) :1053–1062, 1999.
B Efron. Bootstrap methods : Another look at the jackknife. Annals of Statistics, 7(1) :1–26, 1979.
Bradley Efron and David Feldman. Compliance as an explanatory variable in clinical trials. Journal of the American Statistical Association, 86(413) :9–17, 1991.
Shenyang Guo and Mark W Fraser. Propensity score analysis : Statistical methods and applications, volume 11. SAGE publications, 2014.
Miguel Angel Hernán. A definition of causal effect for epidemiological research. Journal of Epidemiology & Community Health, 58(4) :265–271, 2004.
Keisuke Hirano and Guido W Imbens. The propensity score with continuous treatments. Applied Bayesian Modeling and Causal Inference from Incomplete-Data Perspectives, 226164 : 73–84, 2004.
Paul W Holland. Statistics and causal inference. Journal of the American Statistical Association, 81(396) :945–960, 1986.
Kosuke Imai and David A Van Dyk. Causal inference with general treatment regimes : Generalizing the propensity score. Journal of the American Statistical Association, 99(467) : 854–866, 2004.
Kosuke Imai, Gary King, and Elizabeth A Stuart. Misunderstandings between experimentalists and observationalists about causal inference. Journal of the Royal Statistical Society : Series A (Statistics in Society), 171(2) :481–502, 2008.
Guido W Imbens. The role of the propensity score in estimating dose-response functions. Biometrika, 87(3) :706–710, 2000.
Joseph DY Kang, Joseph L Schafer, et al. Demystifying double robustness : A comparison of alternative strategies for estimating a population mean from incomplete data. Statistical Science, 22(4) :523–539, 2007.
Ruth H Keogh, Rhian M Daniel, Tyler J VanderWeele, and Stijn Vansteelandt. Analysis of longitudinal studies with repeated outcome measures : adjusting for time-dependent confounding using conventional methods. American Journal of Epidemiology, 187(5) :1085–1092, 2018.
Jochen Kluve, Hilmar Schneider, Arne Uhlendorff, and Zhong Zhao. Evaluating continuous training programmes by using the generalized propensity score. Journal of the Royal Sta-