Covariables selon la méthode utilisée - Inférence causale pour mesurer le retour sur les investissements publicitaires

Modèle de score de propension

| Méthode | Covariables |
|---|---|
| Hirano-Imbens | MEDIA_COST*_B ; MEDIA_COST*_C ; P_POP_4564 ; p_const2011_2016 ; CRP3_SCORE_STD |
| SCMM ; Moodie-Stephens, avec imputation ; Moodie-Stephens, traitement avec masse à 0 | MEDIA_COST_B ; MEDIA_COST_C ; P_POP_4564 ; p_const2011_2016 ; CRP3_SCORE_STD |

Annexe B

Code R

# 1) Aggregated analysis ----

# a) Average the media-investment variables over the 9 weeks of interest.
#    `DATA` and `vars_media` (character vector naming the investment columns)
#    are defined outside this listing.
DATA_MEAN0 <- DATA %>%
  # keep year 2018, weeks 8 to 16, and rows with a positive GEO_NBF_2016
  filter(GEO_NBF_2016 > 0, YEAR == 2018, WEEKNUM %in% 8:16) %>%
  group_by(GEO_RTA) %>%
  # one row per GEO_RTA; a single unnamed function keeps the column names
  # (`funs()` is deprecated; `list(~ ...)` is what the rest of the script uses)
  summarise_at(vars(one_of(vars_media)), list(~ mean(.))) %>%
  as.data.frame()

# b) K-means clustering ----
# NOTE(review): `Kmeans_data` is not built anywhere in this listing; it is
# presumably DATA_MEAN0 joined with the census covariates -- confirm.
K_means_vars <- c("MEDIA_COST_B", "P_marie", "P_immigr", "P_mater_fr")
MEDIA_A_kmean <- kmeans(
  x = Kmeans_data[, K_means_vars],
  centers = 5,
  nstart = 50
)

MEDIA_A_kmean$size # 136
# average within-cluster sum of squares per observation, one value per cluster
MEDIA_A_kmean$withinss / MEDIA_A_kmean$size
# index of the most homogeneous cluster under that criterion
k <- which.min(MEDIA_A_kmean$withinss / MEDIA_A_kmean$size)

# ** Final dataset: the 136 homogeneous RTAs of cluster k

# Define the LOW / MIDDLE / HIGH groups of RTAs in terms of MEDIA_COST_A.
# NOTE(review): the CLUSTER column of DATA_MEAN0 is never created in this
# listing; it presumably holds MEDIA_A_kmean$cluster -- confirm.
ngroup <- floor(MEDIA_A_kmean$size[k] / 3)
DATA_CLUST1 <- DATA_MEAN0 %>%
  filter(CLUSTER == k) %>%
  # rows are sorted by treatment so the positional labels below make sense
  arrange(MEDIA_COST_A) %>%
  mutate(
    # the listing referenced `phone_kmean`, which is not defined anywhere;
    # MEDIA_A_kmean is the k-means fit used to compute `ngroup` above
    Group = c(
      rep("LOW", ngroup),
      rep("MIDDLE", MEDIA_A_kmean$size[k] - 2 * ngroup),
      rep("HIGH", ngroup)
    ),
    # NOTE(review): the listing is cut off after `ifelse(Group == 'LOW', 0,`.
    # HIGH -> 1 and MIDDLE -> NA is an assumed completion -- confirm against
    # the original script. TREATED is not used elsewhere in this listing.
    TREATED = ifelse(Group == "LOW", 0, ifelse(Group == "HIGH", 1, NA))
  )

# c) Hirano-Imbens method ----
# i) Propensity score model
# The listing writes the 9-week means as MEDIA_COST*_A, MEDIA_COST*_B, ...;
# `*` is not valid in an R name, and the aggregated data is referenced
# elsewhere in the script by the plain names (MEDIA_COST_B in the k-means
# step), so the plain names are used here -- TODO confirm against the data.
vars_drf_1 <- c(
  "MEDIA_COST_B", "MEDIA_COST_C", "P_POP_4564",
  "p_const2011_2016", "CRP3_SCORE_STD"
)
# The response is the treatment MEDIA_COST_A. The listing had MEDIA_COST*_B
# here, which is also a covariate; the bootstrap section of the same script
# and the grid/GPS below all use MEDIA_COST_A.
formule_drf_1 <- paste("MEDIA_COST_A ~", paste(vars_drf_1, collapse = " + "))
modele1_gamma <- glm(
  as.formula(formule_drf_1),
  family = Gamma(link = "inverse"),
  data = DATA_CLUST1
)
summary(modele1_gamma) # dispersion parameter 0.0326

# GPS estimation
# 1. Grid of 93 treatment values (quantiles 3% to 95% by steps of 1%)
grille1 <- quantile(DATA_CLUST1$MEDIA_COST_A, probs = seq(0.03, 0.95, by = 0.01))

# 2. Fitted conditional mean of the treatment
N <- nrow(DATA_CLUST1)
L <- length(grille1)
mu_i1 <- predict(modele1_gamma, type = "response")

# 3. Fitted conditional variance (the listing used an undefined `mu_i`)
phi <- summary(modele1_gamma)$dispersion
var_est_i <- phi * (mu_i1^2)

# 4. GPS as a gamma density with shape ka and scale teta_i:
#    mu_i = ka * teta_i, var_i = ka * teta_i^2
ka <- 1 / phi
teta_i <- mu_i1 / ka

# GPS evaluated at each unit's observed treatment
GPS <- dgamma(DATA_CLUST1$MEDIA_COST_A, scale = teta_i, shape = ka)

# ii) Outcome model for SOUM ----
# Written SOUM* / MEDIA_COST*_A in the listing; plain names are used because
# `*` is not valid in an R name -- TODO confirm the column names.
# `GPS` is the vector computed in the propensity step; lm() finds it in the
# calling environment because it is not a column of DATA_CLUST1.
modele2_gamma <- lm(
  SOUM ~ MEDIA_COST_A + I(MEDIA_COST_A^2) + GPS + I(GPS^2) + MEDIA_COST_A * GPS,
  data = DATA_CLUST1
)
summary(modele2_gamma)
# order: intercept, A, A^2, GPS, GPS^2, A:GPS
alphas <- modele2_gamma$coefficients

# iii) Dose-response function ----
# N x L matrix: GPS of every unit evaluated at every grid value.
# dgamma() is the same density as the hand-written
# 1 / (gamma(ka) * teta^ka) * exp(-t / teta) * t^(ka - 1), computed stably.
gps1_gamma_matrix <- matrix(nrow = N, ncol = L)
for (j in seq_len(L)) {
  gps1_gamma_matrix[, j] <- dgamma(grille1[j], shape = ka, scale = teta_i)
}

# ADRF: for each grid value, average the predicted outcome over all units
result_grille <- numeric(L)
for (t in seq_len(L)) {
  dose <- grille1[t]
  gps_t <- gps1_gamma_matrix[, t]
  result_grille[t] <- mean(
    alphas[1] + alphas[2] * dose + alphas[3] * dose^2 +
      alphas[4] * gps_t + alphas[5] * gps_t^2 + alphas[6] * dose * gps_t
  )
}
hi_clust <- result_grille

# Derivative of the GPS with respect to the treatment ----
# d/dx of the gamma density f(x; shape, scale) is
# f(x) * ((shape - 1) / x - 1 / scale); this equals the expanded expression
# of the listing for x > 0 and avoids overflow in gamma(ka) * teta^ka.
gamma_density_deriv <- function(x, shape, scale) {
  dgamma(x, shape = shape, scale = scale) * ((shape - 1) / x - 1 / scale)
}

# derivative at each unit's observed treatment
gps_deriv <- gamma_density_deriv(DATA_CLUST1$MEDIA_COST_A, ka, teta_i)

# N x L matrix: derivative for every unit at every grid value
gps_deriv_matrix <- matrix(nrow = N, ncol = L)
for (j in seq_len(L)) {
  gps_deriv_matrix[, j] <- gamma_density_deriv(grille1[j], ka, teta_i)
}

# Derivative of the ADRF ----
# Differentiates the outcome model with respect to the dose, the GPS being
# itself a function of the dose (chain rule on the GPS terms).
result_hi_derivee <- numeric(L)
for (t in seq_len(L)) {
  dose <- grille1[t]
  gps_t <- gps1_gamma_matrix[, t]
  dgps_t <- gps_deriv_matrix[, t]
  result_hi_derivee[t] <- mean(
    alphas[2] + 2 * alphas[3] * dose + alphas[4] * dgps_t +
      2 * alphas[5] * gps_t * dgps_t +
      alphas[6] * (dose * dgps_t + gps_t)
  )
}
hi_derivee <- result_hi_derivee

# d) Checking the balancing property of the GPS ----

# Split MEDIA_COST_A into 3 intervals g1 / g2 / g3 at its 33% and 66% quantiles.
# The labels must be exactly "g1", "g2", "g3": the t-tests below compare
# GROUP against these strings (the listing had padded labels such as 'g1 ').
DATA_BALANCED1 <- DATA_CLUST1 %>%
  arrange(MEDIA_COST_A) %>%
  mutate(
    GROUP = ifelse(
      MEDIA_COST_A < quantile(MEDIA_COST_A, .33), "g1",
      ifelse(MEDIA_COST_A > quantile(MEDIA_COST_A, .66), "g3", "g2")
    )
  ) %>%
  as.data.frame()

# i) Before adjusting for the GPS (unadjusted)
# 5 covariates x 3 groups = 15 t-tests

# t statistic of each covariate: units of `group_label` versus all the others
ttest_one_vs_rest <- function(data, group_label) {
  data %>%
    summarise_at(
      vars(one_of(vars_drf_1)),
      list(~ t.test(.[GROUP == group_label], .[GROUP != group_label])$statistic)
    ) %>%
    mutate(GROUP = paste0(group_label, "_MEDIA_A"))
}

ttest_unadjusted <- bind_rows(
  lapply(c("g1", "g2", "g3"), function(g) ttest_one_vs_rest(DATA_BALANCED1, g))
) %>%
  # reshape to one row per covariate and one column per group
  gather(-GROUP, key = Xi, value = ttest) %>%
  spread(key = GROUP, value = ttest) %>%
  mutate_at(vars(-Xi), list(~ round(., 2))) %>%
  as_tibble()

# table of t statistics for equality of means before adjusting for the GPS
ttest_unadjusted

# ii) After adjusting for the GPS (adjusted) ----

# a) Median treatment within each interval g1, g2, g3 (group_by sorts them)
MEDIA_A_median <- DATA_BALANCED1 %>%
  group_by(GROUP) %>%
  summarise(median = median(MEDIA_COST_A)) %>%
  .$median

# b) GPS of every unit evaluated at each of the three median treatments.
# `teta_i` is row-aligned with DATA_CLUST1; this relies on DATA_BALANCED1
# having the same row order (both are sorted by MEDIA_COST_A).
DATA_BALANCED1 <- DATA_BALANCED1 %>%
  mutate(
    gps_median1 = dgamma(MEDIA_A_median[1], shape = ka, scale = teta_i),
    gps_median2 = dgamma(MEDIA_A_median[2], shape = ka, scale = teta_i),
    gps_median3 = dgamma(MEDIA_A_median[3], shape = ka, scale = teta_i)
  )

# c) Cut each of these GPS vectors into 4 equal-width blocks
library(forcats)
nblocks <- 4
block_labels <- paste0("block_", seq_len(nblocks))
DATA_BALANCED1 <- DATA_BALANCED1 %>%
  mutate(
    gps_block_from_m1 = cut(gps_median1, nblocks) %>% lvls_revalue(block_labels),
    gps_block_from_m2 = cut(gps_median2, nblocks) %>% lvls_revalue(block_labels),
    gps_block_from_m3 = cut(gps_median3, nblocks) %>% lvls_revalue(block_labels)
  )

# d) Within each block, t-test of one interval against the two others, then
#    average the block statistics weighted by block size.
# NOTE(review): t.test() fails if a block holds fewer than two units on
# either side of the comparison.
ttest_within_blocks <- function(data, block_col, group_label) {
  block_sizes <- data %>%
    group_by_at(block_col) %>%
    summarise(n = n())
  data %>%
    group_by_at(block_col) %>%
    summarise_at(
      vars(one_of(vars_drf_1)),
      list(~ t.test(.[GROUP == group_label], .[GROUP != group_label])$statistic)
    ) %>%
    ungroup() %>%
    left_join(block_sizes, by = block_col) %>%
    summarise_at(vars(one_of(vars_drf_1)), list(~ sum(. * n) / sum(n))) %>%
    mutate(GROUP = paste0(group_label, "_MEDIA_A"))
}

ttest_adjusted <- bind_rows(
  ttest_within_blocks(DATA_BALANCED1, "gps_block_from_m1", "g1"),
  ttest_within_blocks(DATA_BALANCED1, "gps_block_from_m2", "g2"),
  ttest_within_blocks(DATA_BALANCED1, "gps_block_from_m3", "g3")
) %>%
  gather(-GROUP, key = Xi, value = ttest) %>%
  spread(key = GROUP, value = ttest) %>%
  mutate_at(vars(-Xi), list(~ round(., 2)))

# table of t statistics for equality of means after adjusting for the GPS
ttest_adjusted

## --- Bootstrap (Hirano-Imbens) ---
# Each iteration redraws 120 RTAs, refits both models and recomputes the
# dose-response curve and its derivative on that iteration's own grid.
vars_drf_1 <- c(
  "MEDIA_COST_B", "MEDIA_COST_C", "P_POP_4564",
  "p_const2011_2016", "CRP3_SCORE_STD"
)
formule_drf_1 <- paste("MEDIA_COST_A ~", paste(vars_drf_1, collapse = " + "))

# `rtas` was used here before being assigned anywhere in the listing
rtas <- unique(DATA_CLUST1$GEO_RTA)

iter <- 1000
result_boot_hi <- vector("list", iter)
result_boot_hi_deriv <- vector("list", iter)
grille_hi <- vector("list", iter)

# derivative of the gamma density with respect to x (valid for x > 0)
gamma_density_deriv <- function(x, shape, scale) {
  dgamma(x, shape = shape, scale = scale) * ((shape - 1) / x - 1 / scale)
}

for (i in seq_len(iter)) {
  print(i)
  # NOTE(review): sample() draws WITHOUT replacement and the %in% filter
  # keeps each RTA once, so this is a 120-out-of-N subsample rather than a
  # classical bootstrap -- confirm this is intended.
  rta_sample <- sample(rtas, size = 120)
  DATA_SAMPLE <- DATA_CLUST1 %>% filter(GEO_RTA %in% rta_sample)

  modele1_gps <- glm(
    as.formula(formule_drf_1),
    family = Gamma(link = "inverse"),
    data = DATA_SAMPLE
  )

  # GPS estimation on the resample
  grille1_sample <- quantile(DATA_SAMPLE$MEDIA_COST_A, probs = seq(0.03, 0.95, by = 0.01))
  L <- length(grille1_sample)
  N <- nrow(DATA_SAMPLE)
  mu_i <- predict(modele1_gps, newdata = DATA_SAMPLE, type = "response")
  phi <- summary(modele1_gps)$dispersion
  var_est_i <- phi * (mu_i^2)
  ka <- 1 / phi
  teta_i <- mu_i / ka
  gps_sample <- dgamma(DATA_SAMPLE$MEDIA_COST_A, scale = teta_i, shape = ka)

  # GPS and its derivative for every unit at every grid value
  gps1_gamma_matrix <- matrix(nrow = N, ncol = L)
  gps_deriv_matrix <- matrix(nrow = N, ncol = L)
  for (j in seq_len(L)) {
    gps1_gamma_matrix[, j] <- dgamma(grille1_sample[j], shape = ka, scale = teta_i)
    gps_deriv_matrix[, j] <- gamma_density_deriv(grille1_sample[j], ka, teta_i)
  }

  # outcome model on the resample
  DATA_SAMPLE <- DATA_SAMPLE %>% mutate(GPS = gps_sample)
  modele2_gamma <- lm(
    SOUM ~ MEDIA_COST_A + I(MEDIA_COST_A^2) + GPS + I(GPS^2) + MEDIA_COST_A * GPS,
    data = DATA_SAMPLE
  )
  alphas <- coef(modele2_gamma)

  # ADRF and its derivative, averaged over the units of the resample
  result_sample <- numeric(L)
  result_sample_deriv <- numeric(L)
  for (t in seq_len(L)) {
    dose <- grille1_sample[t]
    gps_t <- gps1_gamma_matrix[, t]
    dgps_t <- gps_deriv_matrix[, t]
    result_sample[t] <- mean(
      alphas[1] + alphas[2] * dose + alphas[3] * dose^2 +
        alphas[4] * gps_t + alphas[5] * gps_t^2 + alphas[6] * dose * gps_t
    )
    result_sample_deriv[t] <- mean(
      alphas[2] + 2 * alphas[3] * dose + alphas[4] * dgps_t +
        2 * alphas[5] * gps_t * dgps_t +
        alphas[6] * (dose * dgps_t + gps_t)
    )
  }

  result_boot_hi[[i]] <- result_sample
  grille_hi[[i]] <- grille1_sample
  result_boot_hi_deriv[[i]] <- result_sample_deriv
}

# 2) Longitudinal analysis ----

# Keep the weekly values of the 9 weeks (8 to 16) of year 2018.
# The listing applied the same filter twice; once is enough.
DATA_LONG2018 <- DATA %>%
  filter(GEO_NBF_2016 > 0, YEAR == 2018, WEEKNUM %in% 8:16) %>%
  # URBAIN is 0 when GEO_RTA contains the character "0", 1 otherwise
  mutate(URBAIN = ifelse(grepl("0", GEO_RTA), 0, 1)) %>%
  # media variables divided by GEO_NBF_2016
  mutate_at(vars(one_of(vars_media)), list(~ . / GEO_NBF_2016)) %>%
  select(YEAR, WEEKNUM, GEO_RTA, GEO_POP, GEO_NBF_2016, URBAIN, one_of(vars_media))

# merge with the STATCAN covariates and drop incomplete rows
DATA_LONG2018 <- DATA_LONG2018 %>%
  left_join(STATCAN, by = c("GEO_RTA", "GEO_NBF_2016", "URBAIN")) %>%
  na.omit() %>%
  as.data.frame()

# ** Cluster of 1224 rows: 136 RTAs repeated 9 times
LONG <- filter(DATA_MEAN0, CLUSTER == k) %>% select(GEO_RTA)
DATA_CLUST1_LONG2018 <- DATA_LONG2018 %>% filter(GEO_RTA %in% LONG$GEO_RTA)

# ** Lagged variables, computed within each RTA in week order.
# The first week of each RTA has no predecessor and gets 0.
DATA_CLUST1_LONG2018 <- DATA_CLUST1_LONG2018 %>%
  arrange(GEO_RTA, WEEKNUM) %>%
  group_by(GEO_RTA) %>%
  mutate(
    SOUM_lag1 = dplyr::lag(SOUM, n = 1, default = 0),
    MEDIA_lag1_A = dplyr::lag(MEDIA_COST_A, n = 1, default = 0)
  ) %>%
  ungroup()

# week number as a factor
DATA_CLUST1_LONG2018 <- DATA_CLUST1_LONG2018 %>%
  mutate(WEEKNUM_fac = as.factor(WEEKNUM))

# 2.1) SCMM approach ----

# Zero treatments are replaced by 0.1. Note the name: DATA_CLUST_LONG2018
# (imputed) is distinct from DATA_CLUST1_LONG2018 (original zeros kept).
DATA_CLUST_LONG2018 <- DATA_CLUST1_LONG2018 %>%
  mutate(MEDIA_COST_A = ifelse(MEDIA_COST_A == 0, 0.1, MEDIA_COST_A)) %>%
  mutate(WEEKNUM_fac = as.factor(WEEKNUM))

# i) Propensity score model: linear regression of the treatment
vars1_SCMM_fac <- c(
  "MEDIA_COST_B", "MEDIA_COST_C", "P_POP_4564",
  "p_const2011_2016", "CRP3_SCORE_STD", "WEEKNUM_fac",
  "MEDIA_lag1_A", "SOUM_lag1"
)
formule_gee_fac <- paste("MEDIA_COST_A ~", paste(vars1_SCMM_fac, collapse = " + "))
mod1 <- lm(as.formula(formule_gee_fac), data = DATA_CLUST_LONG2018)
summary(mod1)
# the fitted treatment values are used as the propensity score
gps_fac <- mod1$fitted.values

# ii) Outcome model for SOUM
# `gps_fac` is not a column of the data; lm() resolves it from the calling
# environment, so it must stay row-aligned with DATA_CLUST_LONG2018.
vars2_SCMM <- c(
  "MEDIA_COST_A", "MEDIA_lag1_A", "SOUM_lag1", "gps_fac",
  "MEDIA_COST_B", "MEDIA_COST_C", "P_POP_4564", "p_const2011_2016",
  "CRP3_SCORE_STD"
)
formule <- paste("SOUM ~", paste(vars2_SCMM, collapse = " + "))
mod2 <- lm(as.formula(formule), data = DATA_CLUST_LONG2018)

# a) Moodie-Stephens, with imputation ----
# i) Propensity score model
vars_drf_LONG_mgps <- c(
  "MEDIA_COST_B", "MEDIA_COST_C", "P_POP_4564",
  "p_const2011_2016", "CRP3_SCORE_STD", "WEEKNUM_fac",
  "MEDIA_lag1_A", "SOUM_lag1"
)
formule_mgps <- paste("MEDIA_COST_A ~", paste(vars_drf_LONG_mgps, collapse = " + "))
modele_gamma_mgps <- glm(
  as.formula(formule_mgps),
  family = Gamma(link = "inverse"),
  data = DATA_CLUST_LONG2018
)
summary(modele_gamma_mgps)

# MGPS estimation
# 1. Grid of 93 treatment values (quantiles 3% to 95% by steps of 1%)
grille1_LONG <- quantile(DATA_CLUST_LONG2018$MEDIA_COST_A, probs = seq(0.03, 0.95, by = 0.01))

# 2. Fitted conditional mean
L <- length(grille1_LONG)
N <- nrow(DATA_CLUST_LONG2018)
mu_i1 <- predict(modele_gamma_mgps, type = "response")

# 3. Fitted conditional variance
phi1 <- summary(modele_gamma_mgps)$dispersion
var_est_i <- phi1 * (mu_i1^2)

# 4. MGPS as a gamma density: mu_i = ka * teta_i, var_i = ka * teta_i^2
ka <- 1 / phi1
teta_i1 <- mu_i1 / ka

# derivative of the gamma density with respect to x (valid for x > 0)
gamma_density_deriv <- function(x, shape, scale) {
  dgamma(x, shape = shape, scale = scale) * ((shape - 1) / x - 1 / scale)
}

# MGPS at the observed treatment, stored as a column for the outcome model
DATA_CLUST_LONG2018 <- DATA_CLUST_LONG2018 %>%
  mutate(mgps1 = dgamma(DATA_CLUST_LONG2018$MEDIA_COST_A, scale = teta_i1, shape = ka))

# 5. MGPS and its derivative for every row at every grid value.
# The listing had lost the `for` header of the derivative matrix, leaving
# an unmatched `}` and an assignment indexed by a stale `j`.
mgps1_deriv <- gamma_density_deriv(DATA_CLUST_LONG2018$MEDIA_COST_A, ka, teta_i1)
mgps1_matrix <- matrix(nrow = N, ncol = L)
mgps1_deriv_matrix <- matrix(nrow = N, ncol = L)
for (j in seq_len(L)) {
  mgps1_matrix[, j] <- dgamma(grille1_LONG[j], shape = ka, scale = teta_i1)
  mgps1_deriv_matrix[, j] <- gamma_density_deriv(grille1_LONG[j], ka, teta_i1)
}

# ii) Outcome model for SOUM
# coefficient order: intercept, A, A^2, mgps1, A:mgps1
mod_mgps <- lm(
  SOUM ~ MEDIA_COST_A + I(MEDIA_COST_A^2) + mgps1 + MEDIA_COST_A * mgps1,
  data = DATA_CLUST_LONG2018
)
summary(mod_mgps)
alphas <- coef(mod_mgps)

# ADRF and its derivative, averaged over all rows for each grid value
result_grille_mgps <- numeric(L)
result_grille_deriv <- numeric(L)
for (t in seq_len(L)) {
  dose <- grille1_LONG[t]
  mgps_t <- mgps1_matrix[, t]
  dmgps_t <- mgps1_deriv_matrix[, t]
  result_grille_mgps[t] <- mean(
    alphas[1] + alphas[2] * dose + alphas[3] * dose^2 +
      alphas[4] * mgps_t + alphas[5] * dose * mgps_t
  )
  result_grille_deriv[t] <- mean(
    alphas[2] + 2 * alphas[3] * dose + alphas[4] * dmgps_t +
      alphas[5] * (dose * dmgps_t + mgps_t)
  )
}
result_mgps1 <- result_grille_mgps
deriv1 <- result_grille_deriv

## --- Bootstrap (Moodie-Stephens, with imputation) ---
# The covariate list is defined before the formula that uses it.
vars_drf_LONG_mgps <- c(
  "MEDIA_COST_B", "MEDIA_COST_C", "P_POP_4564", "p_const2011_2016",
  "CRP3_SCORE_STD", "WEEKNUM_fac", "SOUM_lag1", "MEDIA_lag1_A"
)
formule1_mgps <- paste("MEDIA_COST_A ~", paste(vars_drf_LONG_mgps, collapse = " + "))
rtas <- unique(DATA_CLUST_LONG2018$GEO_RTA)

iter <- 1000
result_boot_ms <- vector("list", iter)
result_boot_ms_deriv <- vector("list", iter)
grille_ms <- vector("list", iter)

# derivative of the gamma density with respect to x (valid for x > 0)
gamma_density_deriv <- function(x, shape, scale) {
  dgamma(x, shape = shape, scale = scale) * ((shape - 1) / x - 1 / scale)
}

for (i in seq_len(iter)) {
  print(i)
  # NOTE(review): RTAs are drawn WITHOUT replacement (120-out-of-N
  # subsample); all weekly rows of a drawn RTA are kept.
  rta_sample <- sample(rtas, size = 120)
  DATA_SAMPLE <- DATA_CLUST_LONG2018 %>% filter(GEO_RTA %in% rta_sample)

  # Refit the propensity model on the resample. The listing fitted it on
  # the full DATA_CLUST_LONG2018, so the first stage never varied across
  # iterations and its uncertainty was left out of the intervals.
  modele_gamma_mgps <- glm(
    as.formula(formule1_mgps),
    family = Gamma(link = "inverse"),
    data = DATA_SAMPLE
  )

  # MGPS estimation on the resample
  grille1_LONG_sample <- quantile(DATA_SAMPLE$MEDIA_COST_A, probs = seq(0.03, 0.95, by = 0.01))
  grille_ms[[i]] <- grille1_LONG_sample
  L <- length(grille1_LONG_sample)
  N <- nrow(DATA_SAMPLE)
  mu_i1 <- predict(modele_gamma_mgps, newdata = DATA_SAMPLE, type = "response")
  phi1 <- summary(modele_gamma_mgps)$dispersion
  var_est_i <- phi1 * (mu_i1^2)
  ka <- 1 / phi1
  teta_i1 <- mu_i1 / ka

  mgps1_SAMPLE <- dgamma(DATA_SAMPLE$MEDIA_COST_A, scale = teta_i1, shape = ka)
  DATA_SAMPLE <- DATA_SAMPLE %>% mutate(mgps1 = mgps1_SAMPLE)

  # MGPS and its derivative for every row at every grid value
  # (the `for` header of the derivative matrix was missing in the listing)
  mgps1_matrix <- matrix(nrow = N, ncol = L)
  mgps1_deriv_matrix <- matrix(nrow = N, ncol = L)
  for (j in seq_len(L)) {
    mgps1_matrix[, j] <- dgamma(grille1_LONG_sample[j], shape = ka, scale = teta_i1)
    mgps1_deriv_matrix[, j] <- gamma_density_deriv(grille1_LONG_sample[j], ka, teta_i1)
  }

  # outcome model on the resample
  mod1_mgps <- lm(
    SOUM ~ MEDIA_COST_A + I(MEDIA_COST_A^2) + mgps1 + MEDIA_COST_A * mgps1,
    data = DATA_SAMPLE
  )
  alphas <- coef(mod1_mgps)

  # ADRF and its derivative
  result_sample1 <- numeric(L)
  result_sample_deriv_ms <- numeric(L)
  for (t in seq_len(L)) {
    dose <- grille1_LONG_sample[t]
    mgps_t <- mgps1_matrix[, t]
    dmgps_t <- mgps1_deriv_matrix[, t]
    result_sample1[t] <- mean(
      alphas[1] + alphas[2] * dose + alphas[3] * dose^2 +
        alphas[4] * mgps_t + alphas[5] * dose * mgps_t
    )
    result_sample_deriv_ms[t] <- mean(
      alphas[2] + 2 * alphas[3] * dose + alphas[4] * dmgps_t +
        alphas[5] * (dose * dmgps_t + mgps_t)
    )
  }

  result_boot_ms[[i]] <- result_sample1
  result_boot_ms_deriv[[i]] <- result_sample_deriv_ms
}

# b) Moodie-Stephens, treatment with a point mass at 0 ----
# Covariates of the longitudinal propensity models (same list as the
# imputation variant), restated so this section is self-contained.
vars_drf_LONG_mgps <- c(
  "MEDIA_COST_B", "MEDIA_COST_C", "P_POP_4564", "p_const2011_2016",
  "CRP3_SCORE_STD", "WEEKNUM_fac", "SOUM_lag1", "MEDIA_lag1_A"
)

# derivative of the gamma density with respect to x (valid for x > 0)
gamma_density_deriv <- function(x, shape, scale) {
  dgamma(x, shape = shape, scale = scale) * ((shape - 1) / x - 1 / scale)
}

# i) MGPS for the non-zero treatment: gamma GLM fitted on rows with A != 0
DATA_CLUST1_LONG2018_0 <- DATA_CLUST1_LONG2018 %>%
  filter(MEDIA_COST_A != 0) %>%
  mutate(WEEKNUM_fac = as.factor(WEEKNUM))
formule1_mgps <- paste("MEDIA_COST_A ~", paste(vars_drf_LONG_mgps, collapse = " + "))
modele1_mgps <- glm(
  as.formula(formule1_mgps),
  family = Gamma(link = "inverse"),
  data = DATA_CLUST1_LONG2018_0
)
summary(modele1_mgps)

grille1_LONG_0 <- quantile(DATA_CLUST1_LONG2018_0$MEDIA_COST_A, probs = seq(0.03, 0.95, by = 0.01))
# predictions are made for ALL rows, including those with a zero treatment
mu_i_0 <- predict(modele1_mgps, newdata = DATA_CLUST1_LONG2018, type = "response")
phi0 <- summary(modele1_mgps)$dispersion
var_est_i <- phi0 * (mu_i_0^2)
ka <- 1 / phi0
teta_i0 <- mu_i_0 / ka
mgps_0 <- dgamma(DATA_CLUST1_LONG2018$MEDIA_COST_A, scale = teta_i0, shape = ka)

# ii) Mixture model: logistic regression for P(MEDIA_COST_A = 0)
# newvar is 1 when MEDIA_COST_A == 0
DATA_CLUST1_LONG2018 <- DATA_CLUST1_LONG2018 %>%
  mutate(newvar = ifelse(MEDIA_COST_A == 0, 1, 0))
DATA_CLUST1_LONG2018 <- DATA_CLUST1_LONG2018 %>%
  mutate(WEEKNUM_fac = DATA_CLUST_LONG2018$WEEKNUM_fac)
formule1_mgps <- paste("newvar ~", paste(vars_drf_LONG_mgps, collapse = " + "))
model_mgps <- glm(
  as.formula(formule1_mgps),
  family = binomial(link = "logit"),
  data = DATA_CLUST1_LONG2018
)
summary(model_mgps)

# r_hat: mixture GPS at the observed treatment.
# The listing left this mutate() unterminated and never defined WEIGHT;
# the definition below is the one used in the bootstrap section of the
# same script.
DATA_CLUST1_LONG2018 <- DATA_CLUST1_LONG2018 %>%
  mutate(
    # pi_hat: fitted probability that MEDIA_COST_A is zero
    pi_hat = round(predict(model_mgps, type = "response"), 10),
    WEIGHT = ifelse(newvar == 1, pi_hat, (1 - pi_hat))
  )
DATA_CLUST1_LONG2018 <- DATA_CLUST1_LONG2018 %>%
  mutate(
    mgps_0 = mgps_0,
    r_hat = ifelse(newvar == 1, WEIGHT, WEIGHT * mgps_0)
  )

# MGPS on the grid
grille1_mass0 <- quantile(DATA_CLUST1_LONG2018$MEDIA_COST_A, probs = seq(0.03, 0.95, by = 0.01))
L <- length(grille1_mass0)
N <- nrow(DATA_CLUST1_LONG2018)

# Same rule as r_hat, applied to each grid value: pi_hat at dose 0,
# (1 - pi_hat) * gamma density at a positive dose. The listing hard-coded
# column 1 as the zero dose; testing the grid value also covers grids with
# several zero quantiles or none. The derivative is set to 0 at dose 0,
# as the listing did for column 1.
mgps0_matrix <- matrix(nrow = N, ncol = L)
mgps0_deriv_matrix <- matrix(nrow = N, ncol = L)
for (j in seq_len(L)) {
  dose <- grille1_mass0[j]
  if (dose == 0) {
    mgps0_matrix[, j] <- DATA_CLUST1_LONG2018$pi_hat
    mgps0_deriv_matrix[, j] <- 0
  } else {
    mgps0_matrix[, j] <- (1 - DATA_CLUST1_LONG2018$pi_hat) *
      dgamma(dose, shape = ka, scale = teta_i0)
    mgps0_deriv_matrix[, j] <- (1 - DATA_CLUST1_LONG2018$pi_hat) *
      gamma_density_deriv(dose, ka, teta_i0)
  }
}

# iii) Outcome model for SOUM
# coefficient order: intercept, A, A^2, r_hat, A:r_hat
modele2_mgps <- lm(
  SOUM ~ MEDIA_COST_A + I(MEDIA_COST_A^2) + r_hat + MEDIA_COST_A * r_hat,
  data = DATA_CLUST1_LONG2018
)
summary(modele2_mgps)
alphas <- coef(modele2_mgps)

# ADRF and its derivative, averaged over all rows for each grid value
result_mass0 <- numeric(L)
result_grille_deriv <- numeric(L)
for (t in seq_len(L)) {
  dose <- grille1_mass0[t]
  mgps_t <- mgps0_matrix[, t]
  dmgps_t <- mgps0_deriv_matrix[, t]
  result_mass0[t] <- mean(
    alphas[1] + alphas[2] * dose + alphas[3] * dose^2 +
      alphas[4] * mgps_t + alphas[5] * dose * mgps_t
  )
  result_grille_deriv[t] <- mean(
    alphas[2] + 2 * alphas[3] * dose + alphas[4] * dmgps_t +
      alphas[5] * (dose * dmgps_t + mgps_t)
  )
}
result_mgps0 <- result_mass0
deriv0 <- result_grille_deriv

## --- Bootstrap (Moodie-Stephens, point mass at 0) ---
vars_drf_LONG_mgps <- c(
  "MEDIA_COST_B", "MEDIA_COST_C", "P_POP_4564", "p_const2011_2016",
  "CRP3_SCORE_STD", "WEEKNUM_fac", "SOUM_lag1", "MEDIA_lag1_A"
)
formule1_mgps <- paste("MEDIA_COST_A ~", paste(vars_drf_LONG_mgps, collapse = " + "))
formule1_log <- paste("newvar ~", paste(vars_drf_LONG_mgps, collapse = " + "))
rtas <- unique(DATA_CLUST1_LONG2018$GEO_RTA)

iter <- 1000
result_boot <- vector("list", iter)
result_boot_deriv <- vector("list", iter)
grille <- vector("list", iter)

# derivative of the gamma density with respect to x (valid for x > 0)
gamma_density_deriv <- function(x, shape, scale) {
  dgamma(x, shape = shape, scale = scale) * ((shape - 1) / x - 1 / scale)
}

for (i in seq_len(iter)) {
  print(i)
  # NOTE(review): RTAs are drawn WITHOUT replacement (120-out-of-N subsample)
  rta_sample <- sample(rtas, size = 120)
  DATA_SAMPLE <- DATA_CLUST1_LONG2018 %>% filter(GEO_RTA %in% rta_sample)
  DATA_SAMPLE_0 <- DATA_SAMPLE %>%
    filter(MEDIA_COST_A != 0) %>%
    mutate(WEEKNUM_fac = as.factor(WEEKNUM))

  # gamma model for the non-zero treatment, fitted on the resample
  modele1_mgps <- glm(
    as.formula(formule1_mgps),
    family = Gamma(link = "inverse"),
    data = DATA_SAMPLE_0
  )
  grille1_LONG_sample0 <- quantile(DATA_SAMPLE_0$MEDIA_COST_A, probs = seq(0.03, 0.95, by = 0.01))
  mu_i_0 <- predict(modele1_mgps, newdata = DATA_SAMPLE, type = "response")
  phi0 <- summary(modele1_mgps)$dispersion
  var_est_i <- phi0 * (mu_i_0^2)
  ka <- 1 / phi0
  teta_i0 <- mu_i_0 / ka
  mgps_sample0 <- dgamma(DATA_SAMPLE$MEDIA_COST_A, scale = teta_i0, shape = ka)

  # logistic model for P(MEDIA_COST_A = 0); newvar is 1 when the treatment is 0
  DATA_SAMPLE <- DATA_SAMPLE %>%
    mutate(newvar = ifelse(MEDIA_COST_A == 0, 1, 0))
  model_mgps <- glm(
    as.formula(formule1_log),
    family = binomial(link = "logit"),
    data = DATA_SAMPLE
  )

  # r_hat on the resample. The listing wrote `mgps_0 = mgps_0`, which
  # reused the density of the full-data fit and left `mgps_sample0`
  # unused; the resample's own density is used here.
  DATA_SAMPLE <- DATA_SAMPLE %>%
    mutate(
      # pi_hat: fitted probability that the treatment is zero
      pi_hat = round(predict(model_mgps, type = "response"), 10),
      WEIGHT = ifelse(newvar == 1, pi_hat, (1 - pi_hat)),
      mgps_0 = mgps_sample0,
      r_hat = ifelse(newvar == 1, WEIGHT, WEIGHT * mgps_0)
    )

  grille1_sample0 <- quantile(DATA_SAMPLE$MEDIA_COST_A, probs = seq(0.03, 0.95, by = 0.01))
  grille[[i]] <- grille1_sample0
  L <- length(grille1_sample0)
  N <- nrow(DATA_SAMPLE)

  # Mixture GPS and its derivative on the grid: pi_hat (derivative 0) at
  # dose 0, (1 - pi_hat) * gamma density at a positive dose. The listing
  # hard-coded column 1 as the zero dose and had lost the `for` header of
  # the derivative matrix.
  mgps0_matrix <- matrix(nrow = N, ncol = L)
  mgps0_deriv_matrix <- matrix(nrow = N, ncol = L)
  for (j in seq_len(L)) {
    dose <- grille1_sample0[j]
    if (dose == 0) {
      mgps0_matrix[, j] <- DATA_SAMPLE$pi_hat
      mgps0_deriv_matrix[, j] <- 0
    } else {
      mgps0_matrix[, j] <- (1 - DATA_SAMPLE$pi_hat) *
        dgamma(dose, shape = ka, scale = teta_i0)
      mgps0_deriv_matrix[, j] <- (1 - DATA_SAMPLE$pi_hat) *
        gamma_density_deriv(dose, ka, teta_i0)
    }
  }

  # outcome model on the resample; the listing read the coefficients from
  # an undefined `fit2_mgps`
  modele2_mgps <- lm(
    SOUM ~ MEDIA_COST_A + I(MEDIA_COST_A^2) + r_hat + MEDIA_COST_A * r_hat,
    data = DATA_SAMPLE
  )
  alphas <- coef(modele2_mgps)

  # ADRF and its derivative
  result_sample0 <- numeric(L)
  result_sample_deriv <- numeric(L)
  for (t in seq_len(L)) {
    dose <- grille1_sample0[t]
    mgps_t <- mgps0_matrix[, t]
    dmgps_t <- mgps0_deriv_matrix[, t]
    result_sample0[t] <- mean(
      alphas[1] + alphas[2] * dose + alphas[3] * dose^2 +
        alphas[4] * mgps_t + alphas[5] * dose * mgps_t
    )
    result_sample_deriv[t] <- mean(
      alphas[2] + 2 * alphas[3] * dose + alphas[4] * dmgps_t +
        alphas[5] * (dose * dmgps_t + mgps_t)
    )
  }

  result_boot[[i]] <- result_sample0
  result_boot_deriv[[i]] <- result_sample_deriv
}

## Summaries and plots of the bootstrap curves (figures 4.1 and 4.2) ----

# Turn a list of bootstrap curves (one numeric vector per iteration, all of
# the same length) into a data frame: one column per replicate (V1, V2, ...),
# the pointwise mean (SOUM), the 5% and 95% pointwise quantiles (ic1, ic2)
# and the grid (MEDIA_COST_A).
summarise_bootstrap <- function(boot_list, grid) {
  reps <- do.call(cbind, boot_list)
  out <- as.data.frame(reps)
  out$SOUM <- rowMeans(reps)
  out$ic1 <- apply(reps, 1, quantile, probs = 0.05)
  out$ic2 <- apply(reps, 1, quantile, probs = 0.95)
  out$MEDIA_COST_A <- as.numeric(grid)
  out
}

# Overlay the three methods: ribbon = [ic1, ic2], points = bootstrap mean.
# Each method uses one label for both fill and colour so the legend entries
# match (the listing mixed "Moodies" and "Moodie" and broke labels across
# lines).
plot_bootstrap_curves <- function(m1, m2, m3) {
  label_hi <- "Hirano-Imbens"
  label_ms <- "Moodie-Stephens, avec imputation"
  label_m0 <- "Moodie-Stephens, traitement avec masse a 0"
  ggplot(data = m1, aes(x = MEDIA_COST_A)) +
    geom_ribbon(aes(ymin = ic1, ymax = ic2, fill = label_hi), alpha = 0.7) +
    geom_point(aes(y = SOUM, color = label_hi)) +
    geom_ribbon(data = m2, aes(ymin = ic1, ymax = ic2, fill = label_ms), alpha = 0.7) +
    geom_point(data = m2, aes(y = SOUM, color = label_ms)) +
    geom_ribbon(data = m3, aes(ymin = ic1, ymax = ic2, fill = label_m0), alpha = 0.7) +
    geom_point(data = m3, aes(y = SOUM, color = label_m0))
}

## Dose-response curves of figure 4.1
# NOTE(review): every iteration has its own quantile grid, but the x axis
# uses the grid of the first iteration only -- confirm this is acceptable.
# 1) Hirano-Imbens
method1 <- summarise_bootstrap(result_boot_hi, grille_hi[[1]])
# 2) Moodie-Stephens, with imputation
method2 <- summarise_bootstrap(result_boot_ms, grille_ms[[1]])
# 3) Moodie-Stephens, treatment with a point mass at 0
method3 <- summarise_bootstrap(result_boot, grille[[1]])

plot_bootstrap_curves(method1, method2, method3)

## Derivative curves of figure 4.2
# 1) Hirano-Imbens
method1 <- summarise_bootstrap(result_boot_hi_deriv, grille_hi[[1]])
# 2) Moodie-Stephens, with imputation
method2 <- summarise_bootstrap(result_boot_ms_deriv, grille_ms[[1]])
# 3) Moodie-Stephens, treatment with a point mass at 0
method3 <- summarise_bootstrap(result_boot_deriv, grille[[1]])

plot_bootstrap_curves(method1, method2, method3) +
  # horizontal line for the SCMM method
  # NOTE(review): 0.015 is hard-coded; presumably the MEDIA_COST_A
  # coefficient of the SCMM outcome model -- confirm.
  geom_line(
    data = data.frame(x = grille1_LONG, y = 0.015),
    aes(x, y, color = "SCMM")
  )

Bibliographie

Lecocq Aurélie, Ammi Mehdi, and Bellarbre Élodie. Le score de propension : Un guide méthodologique pour les recherches expérimentales et quasi expérimentales en éducation. Mesure et Évaluation en Éducation, 37(2) :69–100, 2014.

Peter C Austin. An introduction to propensity score methods for reducing the effects of confounding in observational studies. Multivariate Behavioral Research, 46(3) :399–424, 2011.

Peter C Austin. Assessing covariate balance when using the generalized propensity score with quantitative or continuous exposures. Statistical Methods in Medical Research, 28(5) : 1365–1377, 2019.

Michela Bia and Alessandra Mattei. A stata package for the estimation of the dose-response function through adjustment for the generalized propensity score. The Stata Journal, 8(3) : 354–373, 2008.

Alex Bryson, Richard Dorsett, Susan Purdon, et al. The use of propensity score matching in the evaluation of active labour market policies. Technical report, London School of Economics and Political Science, LSE Library, 2002.

Rajeev Dehejia. Practical propensity score matching : a reply to Smith and Todd. Journal of Econometrics, 125(1-2) :355–364, 2005.

Rajeev H Dehejia and Sadek Wahba. Causal effects in nonexperimental studies : Reevaluating the evaluation of training programs. Journal of the American Statistical Association, 94 (448) :1053–1062, 1999.

B Efron. Bootstrap methods : Another look at the jackknife. Annals of Statistics, 7(1) :1–26, 1979.

Bradley Efron and David Feldman. Compliance as an explanatory variable in clinical trials. Journal of the American Statistical Association, 86(413) :9–17, 1991.

Shenyang Guo and Mark W Fraser. Propensity score analysis : Statistical methods and applications, volume 11. SAGE publications, 2014.

Miguel Angel Hernán. A definition of causal effect for epidemiological research. Journal of Epidemiology & Community Health, 58(4) :265–271, 2004.

Keisuke Hirano and Guido W Imbens. The propensity score with continuous treatments. Applied Bayesian Modeling and Causal Inference from Incomplete-Data Perspectives, 226164 : 73–84, 2004.

Paul W Holland. Statistics and causal inference. Journal of the American Statistical Association, 81(396) :945–960, 1986.

Kosuke Imai and David A Van Dyk. Causal inference with general treatment regimes : Generalizing the propensity score. Journal of the American Statistical Association, 99(467) : 854–866, 2004.

Kosuke Imai, Gary King, and Elizabeth A Stuart. Misunderstandings between experimentalists and observationalists about causal inference. Journal of the Royal Statistical Society : Series A (Statistics in Society), 171(2) :481–502, 2008.

Guido W Imbens. The role of the propensity score in estimating dose-response functions. Biometrika, 87(3) :706–710, 2000.

Joseph DY Kang, Joseph L Schafer, et al. Demystifying double robustness : A comparison of alternative strategies for estimating a population mean from incomplete data. Statistical Science, 22(4) :523–539, 2007.

Ruth H Keogh, Rhian M Daniel, Tyler J VanderWeele, and Stijn Vansteelandt. Analysis of longitudinal studies with repeated outcome measures : adjusting for time-dependent confounding using conventional methods. American Journal of Epidemiology, 187(5) :1085–1092, 2018.

Jochen Kluve, Hilmar Schneider, Arne Uhlendorff, and Zhong Zhao. Evaluating continuous training programmes by using the generalized propensity score. Journal of the Royal Sta-

Dans le document Inférence causale pour mesurer le retour sur les investissements publicitaires (Page 59-81)