# install needed R packages
remotes::update_packages(c("magrittr", "dplyr", "ggplot2", "stringr", "WriteXLS"), upgrade = TRUE)
remotes::install_github(c("dongzhuoer/thesis/r-lib/qGSEA", "dongzhuoer/thesis/r-lib/rGEO"), upgrade = TRUE)

You can run Rscript -e "rmarkdown::render('index.Rmd')" in thesis repo to reproduce this.

Remember to install WenQuanYi Zen Hei font (apt install fonts-wqy-zenhei) for proper rendering of Chinese characters in plot.

1 preparation

Set global theme for ggplot

helper function

2 Figures used in presentation

2.1 GEO数据库现状

Go to https://www.ncbi.nlm.nih.gov/geo/summary/?type=series, following is a snapshot on 2018-05-30.

Expression profiling by array   54,135
Expression profiling by genome tiling array 730
Expression profiling by high throughput sequencing  18,963
Expression profiling by SAGE    238
Expression profiling by MPSS    20
Expression profiling by RT-PCR  587
Expression profiling by SNP array   14

2.3 many slides

Open data-raw/geo-example.xlsx and take screenshots.

2.4 将其他数据库中基因的ID对应到HUGO符号

ensembl_df_to_symbol_df <- 
    . %>% dplyr::mutate('symbol' = hgnc::as_symbol_from_ensembl(!!rlang::sym('Ensembl'))) %>% dplyr::select(-Ensembl)

soft_df <- system.file('extdata/GSE19161_family.soft.gz', package = 'rGEO') %>% 
    {rGEO:::parse_gse_soft(., F)$table} %>% dplyr::select(ID, Ensembl) %>% 
    dplyr::slice(c(1, 2, 446, 3, 231, 4))
soft_df
#> # A tibble: 6 × 2
#>   ID          Ensembl                            
#>   <chr>       <chr>                              
#> 1 121_at      ENSG00000125618                    
#> 2 200003_s_at ENSG00000108107                    
#> 3 209919_x_at ENSG00000100031 /// ENSG00000100121
#> 4 200004_at   ENSG00000110321                    
#> 5 204060_s_at ENSG00000099725 /// ENSG00000183943
#> 6 200006_at   ENSG00000116288

(soft_df_single   <- soft_df %>% filter(!str_detect(Ensembl, '///')))
#> # A tibble: 4 × 2
#>   ID          Ensembl        
#>   <chr>       <chr>          
#> 1 121_at      ENSG00000125618
#> 2 200003_s_at ENSG00000108107
#> 3 200004_at   ENSG00000110321
#> 4 200006_at   ENSG00000116288
(soft_df_multiple <- soft_df %>% filter( str_detect(Ensembl, '///')))
#> # A tibble: 2 × 2
#>   ID          Ensembl                            
#>   <chr>       <chr>                              
#> 1 209919_x_at ENSG00000100031 /// ENSG00000100121
#> 2 204060_s_at ENSG00000099725 /// ENSG00000183943

(soft_df_melt1 <- soft_df_multiple %>% slice(1) %>% hgnc::melt_map('ID', 'Ensembl', '[^\\w]+'))
#> # A tibble: 2 × 2
#>   ID          Ensembl        
#>   <chr>       <chr>          
#> 1 209919_x_at ENSG00000100031
#> 2 209919_x_at ENSG00000100121
(soft_df_melt2 <- soft_df_multiple %>% slice(2) %>% hgnc::melt_map('ID', 'Ensembl', '[^\\w]+'))
#> # A tibble: 2 × 2
#>   ID          Ensembl        
#>   <chr>       <chr>          
#> 1 204060_s_at ENSG00000099725
#> 2 204060_s_at ENSG00000183943

(chip_df_melt1 <- soft_df_melt1 %>% ensembl_df_to_symbol_df())
#> # A tibble: 2 × 2
#>   ID          symbol
#>   <chr>       <chr> 
#> 1 209919_x_at GGT1  
#> 2 209919_x_at GGTLC2
(chip_df_melt2 <- soft_df_melt2 %>% ensembl_df_to_symbol_df())
#> # A tibble: 2 × 2
#>   ID          symbol
#>   <chr>       <chr> 
#> 1 204060_s_at PRKY  
#> 2 204060_s_at PRKX

(chip_df_cast1 <- chip_df_melt1 %>% hgnc::cast_map('ID', 'symbol'))
#> # A tibble: 1 × 2
#>   ID          symbol         
#>   <chr>       <chr>          
#> 1 209919_x_at GGT1 /// GGTLC2
(chip_df_cast2 <- chip_df_melt2 %>% hgnc::cast_map('ID', 'symbol'))
#> # A tibble: 1 × 2
#>   ID          symbol       
#>   <chr>       <chr>        
#> 1 204060_s_at PRKY /// PRKX

(chip_df_single <- soft_df_single %>% ensembl_df_to_symbol_df())
#> # A tibble: 4 × 2
#>   ID          symbol
#>   <chr>       <chr> 
#> 1 121_at      PAX8  
#> 2 200003_s_at RPL28 
#> 3 200004_at   EIF4G2
#> 4 200006_at   PARK7
(chip_df_multiple <- bind_rows(chip_df_cast1, chip_df_cast2))
#> # A tibble: 2 × 2
#>   ID          symbol         
#>   <chr>       <chr>          
#> 1 209919_x_at GGT1 /// GGTLC2
#> 2 204060_s_at PRKY /// PRKX

chip_df <- soft_df %>% hgnc::melt_map('ID', 'Ensembl', '[^ENSG\\d]+') %>%
    ensembl_df_to_symbol_df %>% hgnc::cast_map('ID', 'symbol') %>% 
    dplyr::select('Probe Set ID' = 1, 'Gene Symbol' = 2) %>% slice(c(1, 2, 6, 3, 5, 4))
chip_df
#> # A tibble: 6 × 2
#>   `Probe Set ID` `Gene Symbol`  
#>   <chr>          <chr>          
#> 1 121_at         PAX8           
#> 2 200003_s_at    RPL28          
#> 3 209919_x_at    GGT1 /// GGTLC2
#> 4 200004_at      EIF4G2         
#> 5 204060_s_at    PRKY /// PRKX  
#> 6 200006_at      PARK7

Then open data-raw/mapping-probe-to-symbol.xlsx and take screenshots.

3 Figures used in thesis document

3.1 图 4: rGEO的表现

Again, consult me for how to get gds_result.txt.

All accession in lem4::platform_human are downloaded into data-raw/GPL-html/, so gpls and thus qGSEA::gpl both include following subset of the whole geo, since lem4 ensure that lem4::platform_human includes it.

# NOT run
geo <- rGEO.data::read_summary(system.file('extdata/gds_result.txt', package = 'rGEO.data'))

qGSEA_label <- tribble(
    ~old_type, ~type,
    'known', '可以处理',
    'sequence', '仅探针序列', 
    'unknown', '无法处理',
    'non-coding', '非编码基因'
) %T>% print

sum_to_qualify_qGSEA <- . %>% filter(type != 'non_human') %>%
    mutate(type = ifelse(type %in% c('circRNA', 'miRNA'), 'non-coding', type)) %>% 
    mutate(type = ifelse(type %in% c('unknown', 'non-coding', 'sequence'), type, 'known')) %>% 
    group_by(type) %>% summarise(n = sum(n)) %>% ungroup() %>% 
    left_join(qGSEA_label, ., by = c('old_type' = 'type')) %>% 
    select(-old_type)
rGEO.data::gpl_metas[1:6] %>% count(type) %>% ungroup() %>% sum_to_qualify_qGSEA() %T>% print

types <- rGEO.data::platform$Accession %>% parallel::mclapply(rGEO:::fake_platform, rGEO.data::gpl_metas) %>% parallel::mclapply(rGEO::guess_platform_type)
types %>% lapply(is.null) %>% unlist() %>% {!.} %>% sum
rGEO
gpl_sum <- qGSEA::gpl %>% count(type) %>% ungroup() %>% sum_to_qualify_qGSEA() %T>% print
#> # A tibble: 4 x 3
#>   old_type   type           n
#>   <chr>      <chr>      <int>
#> 1 known      可以处理    2626
#> 2 sequence   仅探针序列   986
#> 3 unknown    无法处理     783
#> 4 non-coding 非编码基因   473

gse_sum <- geo %>% filter(species == 'Homo sapiens', str_detect(accession, 'GSE')) %>% 
    filter(str_count(platform, 'GPL') == 1L) %>% 
    transmute(accession = str_extract(platform, 'GPL\\d+')) %>% count(accession) %>% 
    left_join(qGSEA::gpl) %>% group_by(type) %>% summarise(n = sum(n)) %>% ungroup() %>% 
    sum_to_qualify_qGSEA() %T>% print
#> # A tibble: 4 x 2
#>   type           n
#>   <chr>      <int>
#> 1 可以处理   17753
#> 2 仅探针序列   113
#> 3 无法处理     160
#> 4 非编码基因    57

3.2 图 6: 各个集合在不同数据集中的整体表现

Let’s see how to get the plot, I will show it step by step because the final plot is not so straight-forward.

From now on, we refer to FDR < 0.25 as “significant”.

Let’s rearrange the x axis to see the trend better.

Let’s use line to make it more obvious.

You might already noticed that c3.mir and c4.cgn are quite different. And c7.all is also unusual (it a large collection, you should compare it with c2.cgp and c5.all).

Now, let’s turn the barplot (second one) to a histgram.

Finally, let’s turn it into a density plot.

Epilogue: the trend remains same regardless significant level.

  1. FDR < 0.25 it the most obvious
  2. FDR < 0.1 is still recognizable
  3. FDR < 0.05 is hard to distinguish c7.all from c1.all at first glance
  4. FDR < 0.01 you have to find the difference carefully.