Single Sample Directional Gene Set Analysis (ssdGSA)

Load package for use

library(ssdGSA)
library(GSVA)

Load Example Data Sets

Let’s first load an example data matrix which contains gene expression information with ENTREZ ID as its row names. For the provided sample data matrix in this package, we only have ten samples in total.

Data_matrix <- ssdGSA::data_matrix_entrezID
knitr::kable(head(round(Data_matrix, 3)))

	S1	S2	S3	S4	S5	S6	S7	S8	S9	S10
2268	4.429	3.761	3.706	4.760	2.680	3.937	4.397	4.278	3.831	3.844
572	4.616	4.760	4.944	4.497	4.573	4.575	4.709	5.152	5.142	4.936
952	2.359	0.311	1.894	3.600	-0.438	2.163	3.499	1.738	-0.415	2.279
292	7.752	8.537	8.201	7.817	7.530	7.964	8.026	7.926	8.287	7.682
9020	5.614	5.109	5.317	5.598	5.300	5.241	5.264	5.449	5.157	5.477
6376	5.920	5.540	5.005	6.520	5.321	5.629	5.904	5.381	5.165	5.610

Alternatively, we can load a data matrix which has ENSEMBL ID as its row names, such as the following example data matrix.

knitr::kable(head(round(ssdGSA::data_matrix, 3)))

	S1	S2	S3	S4	S5	S6	S7	S8	S9	S10
ENSG00000000938	4.429	3.761	3.706	4.760	2.680	3.937	4.397	4.278	3.831	3.844
ENSG00000002330	4.616	4.760	4.944	4.497	4.573	4.575	4.709	5.152	5.142	4.936
ENSG00000004468	2.359	0.311	1.894	3.600	-0.438	2.163	3.499	1.738	-0.415	2.279
ENSG00000005022	7.752	8.537	8.201	7.817	7.530	7.964	8.026	7.926	8.287	7.682
ENSG00000006062	5.614	5.109	5.317	5.598	5.300	5.241	5.264	5.449	5.157	5.477
ENSG00000006210	5.920	5.540	5.005	6.520	5.321	5.629	5.904	5.381	5.165	5.610

We can use the function transform_ensembl_2_entrez in the proposed ssdGSA package to transform ENSEMBL ID into ENTREZ ID.

Data_matrix <- transform_ensembl_2_entrez(ssdGSA::data_matrix)
knitr::kable(head(round(Data_matrix, 3)))

	S1	S2	S3	S4	S5	S6	S7	S8	S9	S10
2268	4.429	3.761	3.706	4.760	2.680	3.937	4.397	4.278	3.831	3.844
572	4.616	4.760	4.944	4.497	4.573	4.575	4.709	5.152	5.142	4.936
952	2.359	0.311	1.894	3.600	-0.438	2.163	3.499	1.738	-0.415	2.279
292	7.752	8.537	8.201	7.817	7.530	7.964	8.026	7.926	8.287	7.682
9020	5.614	5.109	5.317	5.598	5.300	5.241	5.264	5.449	5.157	5.477
6376	5.920	5.540	5.005	6.520	5.321	5.629	5.904	5.381	5.165	5.610

Now the data matrix has ENTREZ ID as its row names. Please make sure that the data matrix you input in the main function ssdGSA and ssdGSA_individual has gene ENTREZ ID as its row names.

Then, let’s load an example gene sets, which is a list containing gene ENTREZ ID in each gene set. In this example gene sets, there are in total 10 gene sets (corresponding to 10 pathways in this case), and we display 3 gene sets.

#> $TCELL.KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY
#>   [1] "7535"  "3725"  "10892" "5163"  "940"   "5535"  "5533"  "5534"  "10298"
#>  [10] "29851" "7409"  "3551"  "1326"  "6885"  "207"   "208"   "4893"  "10000"
#>  [19] "7410"  "63928" "10725" "4790"  "4793"  "4792"  "56924" "1493"  "4690" 
#>  [28] "4794"  "23533" "1437"  "920"   "4776"  "1432"  "6300"  "5970"  "4775" 
#>  [37] "3845"  "925"   "926"   "919"   "8440"  "9020"  "57144" "915"   "5058" 
#>  [46] "916"   "917"   "23624" "3937"  "5335"  "4773"  "4772"  "3932"  "998"  
#>  [55] "10125" "6655"  "5063"  "1147"  "5894"  "7124"  "84433" "5062"  "387"  
#>  [64] "5777"  "5133"  "5601"  "5600"  "5605"  "5588"  "5603"  "5604"  "5609" 
#>  [73] "9402"  "2885"  "5595"  "3586"  "5788"  "6654"  "5594"  "3265"  "8517" 
#>  [82] "7006"  "2932"  "868"   "867"   "3702"  "27040" "3458"  "3558"  "959"  
#>  [91] "8503"  "1019"  "5532"  "1739"  "5530"  "5290"  "5291"  "5293"  "8915" 
#> [100] "2534"  "3565"  "2353"  "3567"  "5294"  "5295"  "10451" "5296"  "11261"
#> 
#> $TLR.KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY
#>   [1] "3725"   "941"    "942"    "6772"   "54106"  "6348"   "6416"   "6352"  
#>   [9] "4615"   "6351"   "3929"   "8737"   "10333"  "3551"   "23643"  "1326"  
#>  [17] "6885"   "207"    "208"    "929"    "1513"   "10000"  "6696"   "9641"  
#>  [25] "4790"   "4792"   "23533"  "7100"   "114609" "1432"   "6300"   "5970"  
#>  [33] "7189"   "7187"   "54472"  "6373"   "23118"  "51284"  "51311"  "841"   
#>  [41] "3627"   "1147"   "148022" "7124"   "3442"   "3441"   "3440"   "8772"  
#>  [49] "3439"   "51135"  "3576"   "5601"   "3593"   "5602"   "5600"   "5605"  
#>  [57] "5606"   "3592"   "5603"   "5604"   "5609"   "5608"   "3451"   "3452"  
#>  [65] "3443"   "3444"   "3445"   "3446"   "3447"   "3448"   "3449"   "5595"  
#>  [73] "5879"   "5594"   "3661"   "8517"   "5599"   "3456"   "3454"   "3455"  
#>  [81] "353376" "3553"   "3654"   "958"    "8503"   "29110"  "5290"   "4283"  
#>  [89] "5291"   "5293"   "7098"   "7099"   "10454"  "7096"   "2353"   "7097"  
#>  [97] "5294"   "3663"   "5295"   "3569"   "5296"   "3665"  
#> 
#> $NEUROTROPHIN.KEGG_NEUROTROPHIN_SIGNALING_PATHWAY
#>   [1] "3725"   "10818"  "805"    "5163"   "801"    "27330"  "7161"   "4217"  
#>   [9] "4214"   "673"    "9252"   "4215"   "3551"   "25"     "207"    "399694"
#>  [17] "208"    "4893"   "10000"  "25970"  "4790"   "4793"   "4792"   "4794"  
#>  [25] "23533"  "27018"  "1432"   "6300"   "5970"   "3845"   "53358"  "7189"  
#>  [33] "51806"  "4803"   "627"    "4804"   "10019"  "1445"   "596"    "7529"  
#>  [41] "814"    "815"    "816"    "2309"   "817"    "818"    "8660"   "6272"  
#>  [49] "810"    "1398"   "10603"  "1399"   "5335"   "5336"   "7534"   "396"   
#>  [57] "808"    "7533"   "397"    "7532"   "7531"   "998"    "6655"   "5781"  
#>  [65] "5894"   "6464"   "8767"   "387"    "572"    "51135"  "5580"   "5601"  
#>  [73] "5602"   "9500"   "5600"   "5605"   "5603"   "5604"   "581"    "5609"  
#>  [81] "5607"   "6196"   "2889"   "6197"   "2885"   "5595"   "8986"   "5598"  
#>  [89] "57498"  "5879"   "6654"   "5594"   "6195"   "10971"  "3265"   "5599"  
#>  [97] "11108"  "2932"   "9261"   "5663"   "4909"   "4908"   "3656"   "3654"  
#> [105] "5906"   "8503"   "5908"   "11213"  "7157"   "5290"   "5291"   "5293"  
#> [113] "2549"   "25759"  "356"    "468"    "3667"   "5294"   "4915"   "10782" 
#> [121] "5295"   "4914"   "8471"   "5296"   "4916"   "163688"

This is a list of gene sets with gene set names as component names, and each component is a vector of gene ENTREZ ID.

Also, we need to load the direction matrix which contains directionality information for each gene from summary statistics.

gene	ES	pval
10000	0.200	0.323
10019	1.096	0.272
10125	0.422	0.264
1019	-1.030	0.000
10235	2.008	0.067
1026	0.884	0.075

This direction matrix contains directionality information for each gene, such as effect size (ES), p value, false positive rate (FDR) from summary statistics. Each row of the matrix is for one gene, and there should be at least two columns (with the 1st column containing gene ENTREZ ID, and 2nd column containing directionality information of that gene). For the provided sample direction matrix in this package we include three columns: gene, ES and pval, corresponding to gene ENTREZ ID, effect size and p value. Also note that when direction matrix is missing, scores from traditional single sample gene set analysis would be calculated by the proposed ssdGSA package.

Functions

ssdGSA

This function is to do single sample directional gene set analysis, which inherits the standard gene set variation analysis(GSVA) method, but also provides the option to use summary statistics from any analysis (disease vs healthy, LS vs NL, etc..) input to define the direction of gene sets used for directional gene set score calculation for a given disease. Note that this function is specific for using group weighted scores.

Below are the default parameters for ssdGSA. You can change them to modify your output. Use help(ssdGSA) to learn more about the parameters.

ssdGSA(Data = Data_matrix,
       Gene_sets = Gene_sets,
       Direction_matrix = Direction_matrix, 
       GSA_weight = "group_weighted",
       GSA_weighted_by = "sum.ES", #options are: "sum.ES", "avg.ES", "median.ES"
       GSA_method = "gsva", #"options are: "gsva", "ssgsea", "zscore", "avg.exprs", and "median.exprs"
       min.sz = 1,  # GSVA parameter
       max.sz = 2000, # GSVA parameter
       mx.diff = TRUE # GSVA parameter
)
#> Warning messages: 
#> 100% (3/3) of gene sets have information missing in the data matrix for at least one gene; 
#> 0% (0/3) of gene sets have information missing in the direction matrix for at least one gene.
#> TCELL.KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY: 
#>  6.48% of genes (7/108) in this gene set have information missing in the data matrix; 
#> TLR.KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY: 
#>  18.63% of genes (19/102) in this gene set have information missing in the data matrix; 
#> NEUROTROPHIN.KEGG_NEUROTROPHIN_SIGNALING_PATHWAY: 
#>  2.38% of genes (3/126) in this gene set have information missing in the data matrix;
#> Note: Running GSA for 3 gene sets using 'gsva' method with 'group_weighted' weights.
#>                                                         S1         S2
#> TCELL.KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY     0.8899907 -0.7563572
#> TLR.KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY    0.4402918 -0.7308813
#> NEUROTROPHIN.KEGG_NEUROTROPHIN_SIGNALING_PATHWAY 0.6600825 -0.4376452
#>                                                           S3        S4
#> TCELL.KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY     -0.11595511 0.9147812
#> TLR.KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY     0.02531708 0.8983102
#> NEUROTROPHIN.KEGG_NEUROTROPHIN_SIGNALING_PATHWAY  0.32165136 0.5579457
#>                                                          S5          S6
#> TCELL.KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY     -0.8046286  0.67310666
#> TLR.KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY    -0.0860897 -0.21186968
#> NEUROTROPHIN.KEGG_NEUROTROPHIN_SIGNALING_PATHWAY -0.2392317  0.02336175
#>                                                         S7         S8
#> TCELL.KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY     0.5649402 -0.2881176
#> TLR.KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY    0.7599040 -0.2073954
#> NEUROTROPHIN.KEGG_NEUROTROPHIN_SIGNALING_PATHWAY 0.0519870 -0.3424723
#>                                                          S9        S10
#> TCELL.KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY     -0.8291629 -0.1599448
#> TLR.KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY    -1.0423261  0.3924671
#> NEUROTROPHIN.KEGG_NEUROTROPHIN_SIGNALING_PATHWAY -0.7903744  0.1108400

A matrix of directional gene set scores from single sample directional gene set analysis when using group weighted scores, with rows corresponding to gene sets and columns corresponding to different samples is returned.

ssdGSA(Data = Data_matrix,
       Gene_sets = Gene_sets,
       Direction_matrix = NULL, 
       GSA_weight = "group_weighted",
       GSA_weighted_by = "sum.ES", #options are: "sum.ES", "avg.ES", "median.ES"
       GSA_method = "gsva", #"options are: "gsva", "ssgsea", "zscore", "avg.exprs", and "median.exprs"
       min.sz = 6,  # GSVA parameter
       max.sz = 2000, # GSVA parameter
       mx.diff = TRUE # GSVA parameter
)
#> Warning messages: 
#> 100% (3/3) of gene sets have information missing in the data matrix for at least one gene.
#> TCELL.KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY: 
#>  6.48% of genes (7/108) in this gene set have information missing in the data matrix; 
#> TLR.KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY: 
#>  18.63% of genes (19/102) in this gene set have information missing in the data matrix; 
#> NEUROTROPHIN.KEGG_NEUROTROPHIN_SIGNALING_PATHWAY: 
#>  2.38% of genes (3/126) in this gene set have information missing in the data matrix;
#> Note: Running GSVA for 3 gene sets using 'gsva' method with 'group_weighted' weights.
#>                                                          S1          S2
#> TCELL.KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY     0.32529011 -0.06270977
#> TLR.KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY    0.03361398 -0.22037641
#> NEUROTROPHIN.KEGG_NEUROTROPHIN_SIGNALING_PATHWAY 0.05229527  0.26714173
#>                                                           S3          S4
#> TCELL.KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY     -0.06606869  0.08399112
#> TLR.KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY     0.02795322  0.24911101
#> NEUROTROPHIN.KEGG_NEUROTROPHIN_SIGNALING_PATHWAY -0.03985403 -0.17438679
#>                                                          S5          S6
#> TCELL.KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY     0.13796232  0.26365975
#> TLR.KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY    0.15094708 -0.06247332
#> NEUROTROPHIN.KEGG_NEUROTROPHIN_SIGNALING_PATHWAY 0.03404076 -0.08262855
#>                                                           S7          S8
#> TCELL.KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY     -0.05860165 -0.20555643
#> TLR.KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY     0.18322508 -0.08882073
#> NEUROTROPHIN.KEGG_NEUROTROPHIN_SIGNALING_PATHWAY -0.17181714 -0.10191413
#>                                                          S9        S10
#> TCELL.KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY     -0.2702770 -0.1182854
#> TLR.KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY    -0.3976409  0.1118144
#> NEUROTROPHIN.KEGG_NEUROTROPHIN_SIGNALING_PATHWAY -0.0553835  0.1801999
#> attr(,"geneSets")
#> attr(,"geneSets")$TCELL.KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY
#>   [1] "7535"  "3725"  "10892" "5163"  "940"   "5533"  "5534"  "10298" "29851"
#>  [10] "7409"  "3551"  "1326"  "6885"  "207"   "208"   "4893"  "10000" "7410" 
#>  [19] "63928" "10725" "4790"  "4793"  "4792"  "56924" "1493"  "4690"  "4794" 
#>  [28] "23533" "920"   "4776"  "1432"  "6300"  "5970"  "4775"  "3845"  "925"  
#>  [37] "926"   "919"   "8440"  "9020"  "915"   "5058"  "916"   "917"   "23624"
#>  [46] "3937"  "5335"  "4773"  "4772"  "3932"  "998"   "10125" "6655"  "5063" 
#>  [55] "1147"  "5894"  "7124"  "84433" "5062"  "387"   "5777"  "5133"  "5601" 
#>  [64] "5600"  "5605"  "5588"  "5603"  "5604"  "5609"  "9402"  "2885"  "5595" 
#>  [73] "3586"  "5788"  "6654"  "5594"  "3265"  "8517"  "7006"  "2932"  "868"  
#>  [82] "867"   "3702"  "27040" "3458"  "959"   "8503"  "1019"  "5532"  "1739" 
#>  [91] "5530"  "5290"  "5291"  "5293"  "8915"  "2534"  "2353"  "5294"  "5295" 
#> [100] "10451" "11261"
#> 
#> attr(,"geneSets")$TLR.KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY
#>  [1] "3725"   "941"    "942"    "6772"   "54106"  "6348"   "6416"   "6352"  
#>  [9] "4615"   "6351"   "8737"   "10333"  "3551"   "23643"  "1326"   "6885"  
#> [17] "207"    "208"    "929"    "1513"   "10000"  "6696"   "9641"   "4790"  
#> [25] "4792"   "23533"  "7100"   "114609" "1432"   "6300"   "5970"   "7189"  
#> [33] "7187"   "54472"  "6373"   "23118"  "51284"  "51311"  "841"    "3627"  
#> [41] "1147"   "148022" "7124"   "8772"   "51135"  "3576"   "5601"   "5602"  
#> [49] "5600"   "5605"   "5606"   "5603"   "5604"   "5609"   "5608"   "5595"  
#> [57] "5879"   "5594"   "3661"   "8517"   "5599"   "3454"   "3455"   "3553"  
#> [65] "3654"   "958"    "8503"   "29110"  "5290"   "4283"   "5291"   "5293"  
#> [73] "7098"   "7099"   "10454"  "7096"   "2353"   "7097"   "5294"   "3663"  
#> [81] "5295"   "3569"   "3665"  
#> 
#> attr(,"geneSets")$NEUROTROPHIN.KEGG_NEUROTROPHIN_SIGNALING_PATHWAY
#>   [1] "3725"   "10818"  "805"    "5163"   "801"    "27330"  "7161"   "4217"  
#>   [9] "4214"   "673"    "9252"   "4215"   "3551"   "25"     "207"    "399694"
#>  [17] "208"    "4893"   "10000"  "25970"  "4790"   "4793"   "4792"   "4794"  
#>  [25] "23533"  "27018"  "1432"   "6300"   "5970"   "3845"   "53358"  "7189"  
#>  [33] "51806"  "627"    "4804"   "10019"  "1445"   "596"    "7529"   "814"   
#>  [41] "815"    "816"    "2309"   "817"    "818"    "8660"   "6272"   "810"   
#>  [49] "1398"   "10603"  "1399"   "5335"   "5336"   "7534"   "396"    "808"   
#>  [57] "7533"   "397"    "7532"   "7531"   "998"    "6655"   "5781"   "5894"  
#>  [65] "6464"   "8767"   "387"    "572"    "51135"  "5580"   "5601"   "5602"  
#>  [73] "9500"   "5600"   "5605"   "5603"   "5604"   "581"    "5609"   "5607"  
#>  [81] "6196"   "2889"   "6197"   "2885"   "5595"   "8986"   "5598"   "57498" 
#>  [89] "5879"   "6654"   "5594"   "6195"   "10971"  "3265"   "5599"   "11108" 
#>  [97] "2932"   "9261"   "5663"   "4909"   "4908"   "3656"   "3654"   "5906"  
#> [105] "8503"   "5908"   "11213"  "7157"   "5290"   "5291"   "5293"   "2549"  
#> [113] "25759"  "356"    "468"    "3667"   "5294"   "4915"   "10782"  "5295"  
#> [121] "4914"   "4916"   "163688"

Alternatively, when direction matrix is missing, i.e., Direction_matrix = NULL, scores from traditional single sample gene set analysis without directionality information will be calculated and returned.

ssdGSA_individual

This function is to do single sample directional gene set analysis when using individual weighted scores.

Below are the default parameters for ssdGSA_individual. You can change them to modify your output. Use help(ssdGSA_individual) to learn more about the parameters.

ssdGSA_individual(Data = Data_matrix,
                  Gene_sets = Gene_sets,
                  Direction_matrix = Direction_matrix
)
#> Warning messages: 
#> 100% (3/3) of gene sets have information missing in the data matrix for at least one gene; 
#> 0% (0/3) of gene sets have information missing in the direction matrix for at least one gene.
#> TCELL.KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY: 
#>  6.48% of genes (7/108) in this gene set have information missing in the data matrix; 
#> TLR.KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY: 
#>  18.63% of genes (19/102) in this gene set have information missing in the data matrix; 
#> NEUROTROPHIN.KEGG_NEUROTROPHIN_SIGNALING_PATHWAY: 
#>  2.38% of genes (3/126) in this gene set have information missing in the data matrix;
#> Note: Running GSA for 3 gene sets using individual weights.
#>                                                         S1        S2        S3
#> TCELL.KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY     0.8393467 0.7441525 0.7692775
#> TLR.KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY    0.9901754 0.7456950 0.9189434
#> NEUROTROPHIN.KEGG_NEUROTROPHIN_SIGNALING_PATHWAY 0.5689334 0.5562217 0.5617713
#>                                                         S4        S5        S6
#> TCELL.KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY     0.8266904 0.6935935 0.8230364
#> TLR.KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY    0.9890529 0.7882257 0.9169321
#> NEUROTROPHIN.KEGG_NEUROTROPHIN_SIGNALING_PATHWAY 0.5752082 0.5474677 0.5705729
#>                                                         S7        S8        S9
#> TCELL.KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY     0.8144697 0.7539750 0.7184018
#> TLR.KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY    1.0088957 0.8382858 0.7202826
#> NEUROTROPHIN.KEGG_NEUROTROPHIN_SIGNALING_PATHWAY 0.5698750 0.5602913 0.5584633
#>                                                        S10
#> TCELL.KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY     0.7497758
#> TLR.KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY    0.9084717
#> NEUROTROPHIN.KEGG_NEUROTROPHIN_SIGNALING_PATHWAY 0.5641252

A matrix of directional gene set scores from single sample directional gene set analysis when using individual weighted scores, with rows corresponding to gene sets and columns corresponding to different samples is returned.