-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathCellFie.m
343 lines (319 loc) · 14.6 KB
/
CellFie.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
function[score, score_binary ,taskInfos, detailScoring]=CellFie(data,SampleNumber,ref,param)
% Compute the score associated to each metabolic task listed in taskstructure based on transcriptomic data
%
% USAGE:
% [score, score_binary ,taskInfos, detailScoring]=CellFie(data,SampleNumber,ref,param)
%
% INPUTS:
% data
% .gene cell array containing GeneIDs in the same
% format as model.genes
% .value mRNA expression data structure (genes x samples)associated to each gene metioned in data.gene
% SampleNumber Number of samples
% ref Reference model used to compute the
% metabolic task scores (e.g.,'MT_recon_2_2_entrez.mat')
% OPTIONAL INPUTS:
% param.ThreshType Type of thresholding approach used
% (i.e.,'global' or 'local') (default - local)
% related to the use of a GLOBAL thresholding approach - the threshold value is the same for all the genes
% param.percentile_or_value the threshold can be defined using a value introduced by the user ('value')
% or based on a percentile of the distribution of expression value for all the
% genes and across all samples of your
% dataset ('percentile')
% param.percentile percentile from the distribution of
% expression values for all the genes and across all samples that will be
% used to define the threshold value
% param.value expression value for which a gene is
% considered as active or not (e.g., 5)
%
% related to the use of a LOCAL thresholding approach - the threshold value is different for all the genes
% param.percentile_or_value the threshold can be defined using a value introduced by the user ('value')
% or based on a percentile of the distribution of expression value of a
% specific gene across all samples of your
% dataset ('percentile'-default)
% param.LocalThresholdType option to define the type of local thresholding approach to use
% - 'minmaxmean' (default options )- the threshold for a gene is determined by the mean of expression
% values observed for that gene among all the samples, tissues, or conditions BUT
% the threshold :(i) must be higher or equal to a lower bound and (ii) must be lower
% or equal to an upper bound.
% - 'mean' -the threshold for a gene is defined as the mean expression value
% of this gene across all the samples, tissues, or conditions
% param.percentile_low lower percentile used to define which gene
% are always inactive in the case of use 'MinMaxMean' local thresholding
% approach (default = 25)
% param.percentile_high upper percentile used to define which gene
% are always active in the case of use 'MinMaxMean' local thresholding
% approach (default= 75)
% param.value_low lower expression value used to define which gene
% are always inactive in the case of use 'MinMaxMean' local thresholding
% approach (e.g., 5)
% param.value_high upper expression value used to define which gene
% are always active in the case of use 'MinMaxMean' local thresholding
% approach (e.g., 5)
%
% related to the gene mapping approach used
% param.minSum: instead of using min and max, use min for AND and Sum
% for OR (default: false, i.e. use min)
% OUTPUTS:
% score relative quantification of the activity of a metabolic task in a specific condition
% based on the availability of data for multiple conditions
% score_binary binary version of the metabolic task score
% to determine whether a task is active or inactive in specific
% conditions
% taskInfos Description of the metabolic task assessed
% detailScoring Matrix detailing the scoring
% 1st column = sample ID
% 2nd column = task ID
% 3th column = task score for this sample
% 4th column = task score in binary version for this sample
% 5th column = essential reaction associated to this task
% 6th column = expression score associated to the reaction listed in the 5th column
% 7th column = gene used to determine the expression of the reaction listed in the 5th column
% 8th column = original expression value of the gene listed in the 7th column
%
% .. Authors:
% - Anne Richelle, January 2019
if size(data.value,2) ~= SampleNumber
error('The number of samples defined is not the same as the size of the dataset')
end
if size(data.value,1) ~= length(data.gene)
error('data.value does not have the same number of rows as data.gene')
end
if ~exist('ref','var')
error('The reference model has not been defined - please choose a reference model')
end
if ~exist('param','var')
param.ThreshType='local';
param.percentile_or_value='percentile';
param.LocalThresholdType='minmaxmean';
param.percentile_low=25;
param.percentile_high=75;
end
% f = waitbar(0,'Please wait...');
% waitbar(.15,f,'Loading your data');
%load the info about the task structure
load('taskStructure')
taskInfos=struct2cell(taskStructure);
taskInfos=taskInfos';
taskInfos(:,5:end)=[];
%load the reference model in matlab
load(ref);
%% Depending on the model, load the list of reactions associated with the task
% All these files have the following format
% essentialRxnsbyTask_name_of_model and are located in essentialRxns folder
load(strcat('essentialRxns/essentialRxnsbyTask_',ref));
% check that at least part of list of gene provided are in the model loaded
ID_model=[];
gene_notInModel=[];
for i=1:length(data.gene)
if isempty(find(strcmp(data.gene{i},model.genes)))
gene_notInModel(end+1)=i;
else
tmpid=find(strcmp(data.gene{i},model.genes)==1);
ID_model(end+1)=tmpid(1);
end
end
% introduce a warning in the webtool about how many of the genes provided are
% actually mapped to the model
if isempty(gene_notInModel)
display('All genes provided in data are included in the reference model')
else
display([num2str(length(gene_notInModel)),' genes provided are not included in the reference model:'])
data.gene(gene_notInModel)
end
% remove the gene and associated value provided by the user in data that are not in the model
data.gene(gene_notInModel)=[];
if SampleNumber==1
data.value(gene_notInModel)=[];
else
data.value(gene_notInModel,:)=[];
end
% get the threshold value and the histogram for the complete dataset and
% print a figure
if SampleNumber>1
linData = reshape(data.value,numel(data.value),1);
else
linData=data.value;
end
linData(linData==0)=[];
% definition of the thresholds
if strcmp(param.ThreshType,'global') && strcmp(param.percentile_or_value,'percentile')
display('RUN - global: percentile')
l_global = (prctile(log10(linData),param.percentile));
data.ths=10^l_global;
elseif strcmp(param.ThreshType,'global') && strcmp(param.percentile_or_value,'value')
display('RUN - global: value')
data.ths=param.value;
elseif strcmp(param.ThreshType,'local') && strcmp(param.LocalThresholdType,'mean')
display('RUN - local: mean')
elseif strcmp(param.ThreshType,'local') && strcmp(param.LocalThresholdType,'minmaxmean')&& strcmp(param.percentile_or_value,'percentile')
display('RUN - local: minmaxmean: percentile')
l_high = (prctile(log10(linData),param.percentile_high));
data.ths_high=10^l_high;
l_low = (prctile(log10(linData),param.percentile_low));
data.ths_low=10^l_low;
elseif strcmp(param.ThreshType,'local') && strcmp(param.LocalThresholdType,'minmaxmean')&& strcmp(param.percentile_or_value,'value')
display('RUN - local: minmaxmean: value')
data.ths_high=param.value_high;
data.ths_low=param.value_low;
else
error('No analysis triggered')
end
%% Compute the threshold(s) depending on the approach used
Gene_score=[];
switch param.ThreshType
case 'local'
if strcmp(param.LocalThresholdType,'mean')
%the threshold for each gene is equal to its mean value over
%all the samples
threshold=mean(data.value,2)';
else
threshold=[];
for i=1:length(data.gene)
expressionValue=data.value(i,:);
if mean(expressionValue)>=data.ths_high
threshold(i)=data.ths_high;
else
threshold(i)=max(mean(expressionValue),data.ths_low);
end
end
end
% every single gene is associated to an expression score
for i=1:SampleNumber
Gene_score(:,i)=5.*log(1+(data.value(:,i)./threshold'));
end
case 'global'
Gene_score=5.*log(1+(data.value./data.ths));
end
% Mapping of the expression data to the model
expression.gene=data.gene;
expression.Rxns=[];
expression.gene_used=[];
expression.count=[];
minSum = false;
%waitbar(.25,f,'Load GPR parse');
%% load parsedGPR for each model
load(strcat('parsedGPR/parsedGPR_',ref));
%%parsedGPR = GPRparser(model,minSum);%code to compute the parsed GPR using
%%cobratoolbox
%waitbar(.45,f,'Mapping of the expression data to the model');
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% run all samples at the same time
expression.value=Gene_score;
% Find wich genes in expression data are used in the model
[gene_id, gene_expr] = findUsedGenesLevels_all(model,expression);
% Link the gene to the model reactions
[expressionRxns, gene_used] = selectGeneFromGPR_all(model, gene_id, gene_expr, parsedGPR, minSum);
for zz = 1:SampleNumber
gene_all=[];
for j=1:length(gene_used(:,zz))
if ~isempty(gene_used{j,zz})
gene_all(end+1)=str2num(gene_used{j,zz}{1});
end
end
countGene = tabulate(gene_all);
count=[];
for k=1:length(gene_used(:,zz))
if ~isempty(gene_used{k,zz})
tmp=countGene(str2num(gene_used{k,zz}{1}),2);
count(k)=tmp;
else
count(k)=0;
end
end
expression.count=[expression.count count'];
end
expression.Rxns = expressionRxns;
expression.gene_used = gene_used;
%% Compute the score
expressionRxns=expression.Rxns;
significance=1./expression.count;
significance(isinf(significance))=0;
ScorebyTask=[];
ScorebyTask_binary=[];
%waitbar(.75,f,'Compute the task activity score');
for i=1:size(taskInfos,1)
if ~isempty(essentialRxns{i})
rxns=essentialRxns{i};
rxnID=findRxnIDs(model,rxns);
rxnID(rxnID==0)=[];
if ~isempty(rxnID)
expValue=expressionRxns(rxnID,:);
signValue=significance(rxnID,:);
% if no gene is associated with one of the reaction -
% remove the reactions from the count
if ~isempty(find(sum(expValue,2)==-SampleNumber))
signValue(find(sum(expValue,2)==-SampleNumber),:)=[];
expValue(find(sum(expValue,2)==-SampleNumber),:)=[];
end
if ~isempty(expValue)
if size(expValue,1)>1
ScorebyTask(i,:)=sum(expValue.*signValue)./size(expValue,1);
Val=sum(expValue)./size(expValue,1);
ID_up=find(Val>=5*log(2));
ScorebyTask_binary(i,:)=zeros(1,SampleNumber);
ScorebyTask_binary(i,ID_up)=1;
else
ScorebyTask(i,:)=expValue.*signValue;
ID_up=find(expValue>=5*log(2));
ScorebyTask_binary(i,:)=zeros(1,SampleNumber);
ScorebyTask_binary(i,ID_up)=1;
end
else
ScorebyTask(i,:)=-1.*ones(1,SampleNumber);
ScorebyTask_binary(i,:)=-1.*ones(1,SampleNumber);
end
else
ScorebyTask(i,:)=-1.*ones(1,SampleNumber);
ScorebyTask_binary(i,:)=-1.*ones(1,SampleNumber);
end
else
ScorebyTask(i,:)=-1.*ones(1,SampleNumber);
ScorebyTask_binary(i,:)=-1.*ones(1,SampleNumber);
end
end
detailScoring={};
tmpSampNum = {};
for j=1:SampleNumber
tmpSampNum{j} = int2str(j);
incR.(['incR',tmpSampNum{j}])=1;
end
for i=1:size(taskInfos,1)
if ~isempty(essentialRxns{i})
rxns=essentialRxns{i};
rxnID=findRxnIDs(model,rxns);
rxnID(rxnID==0)=[];
for j=1:SampleNumber
if ~isempty(rxnID)
for k=1:length(rxnID)
%1st column = sample ID
detailScoring{incR.(['incR',tmpSampNum{j}]),((j-1)*8)+1}=j;
%2nd column = task ID
detailScoring{incR.(['incR',tmpSampNum{j}]),((j-1)*8)+2}=i;
%3th column = task score for this sample
detailScoring{incR.(['incR',tmpSampNum{j}]),((j-1)*8)+3}=ScorebyTask(i,j);
%4th column = task score in binary version for this sample
detailScoring{incR.(['incR',tmpSampNum{j}]),((j-1)*8)+4}=ScorebyTask_binary(i,j);
%5th column = essential reaction associated to this
%task
detailScoring{incR.(['incR',tmpSampNum{j}]),((j-1)*8)+5}=rxns(k);
%6th column = expression score associated to the
%reaction listed in the 5th column
detailScoring{incR.(['incR',tmpSampNum{j}]),((j-1)*8)+6}=expression.Rxns(rxnID(k),j);
%7th column = gene used to determine the expression of the
%reaction listed in the 5th column
geneName=expression.gene_used(rxnID(k),j);
detailScoring{incR.(['incR',tmpSampNum{j}]),((j-1)*8)+7}=geneName;
%8th column = original expression value of the gene
%listed in the 7th column
detailScoring{incR.(['incR',tmpSampNum{j}]),((j-1)*8)+8}=data.value((strcmp(data.gene,geneName{1})),j); % 8.64
incR.(['incR',tmpSampNum{j}])=incR.(['incR',tmpSampNum{j}])+1;
end
end
end
end
score=ScorebyTask;
score_binary=ScorebyTask_binary;
end
end
%close(f)