geckomat/get_enzyme_data/matchKcats.m

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 % kcats = matchKcats(model_data,org_name, minAcceptableKCat)
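 % Matches the model EC numbers and substrates to the BRENDA database, to
 % return the corresponding kcats for each reaction.
 %
% INPUTS:   model_data: Model data structure (generated by getECnumbers.m)
%           org_name: Name of the modelled organism; it is compared
%           (case-insensitively) with the KEGG organism names and with
%           the organism names of the BRENDA entries
%           minAcceptableKCat [optional]: Set this value if you want to
%           replace too low kcats with a minimum value (unit: s^-1).
%           Defaults to 0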
 % OUTPUTS:  kcats, which contains:
 %           *forw.kcats:   kcat values for the forward reactions (mxn)
 %           *forw.org_s:   Number of matches for organism - substrate in
 %                          forward reaction (mxn)
 %           *forw.rest_s:  Number of matches for any organism - substrate
 %                          in forward reaction (mxn)
 %           *forw.org_ns:  Number of matches for organism - any substrate
 %                          in forward reaction (mxn)
 %           *forw.rest_ns: Number of matches for any organism - any
 %                          substrate in forward reaction (mxn)
 %           *forw.org_sa:  Number of matches for organism - using s.a.
 %                          in forward reaction (mxn)
 %           *forw.rest_sa: Number of matches for any organism - using s.a.
 %                          in forward reaction (mxn)
 %           *back.kcats:   kcat values for the backward reactions (mxn)
 %           *back.org_s:   Number of matches for organism - substrate in
 %                          backwards reaction (mxn)
 %           *back.rest_s:  Number of matches for any organism - substrate
 %                          in backwards reaction (mxn)
 %           *back.org_ns:  Number of matches for organism - any substrate
 %                          in backwards reaction (mxn)
 %           *back.rest_ns: Number of matches for any organism - any
 %                          substrate in backwards reaction (mxn)
 %           *back.org_sa:  Number of matches for organism - using s.a.
 %                          in backwards reaction (mxn)
 %           *back.rest_sa: Number of matches for any organism - using s.a.
 %                          in backwards reaction (mxn)
 %           *tot.queries:  The total amount of ECs matched (1x1)
 %           *tot.org_s:    The amount of ECs matched for the organism & the
 %                          substrate (1x1)
 %           *tot.rest_s:   The amount of ECs matched for any organism & the
 %                          substrate (1x1)
 %           *tot.org_ns:   The amount of ECs matched for the organism & any
 %                          substrate (1x1)
 %           *tot.rest_ns:  The amount of ECs matched for any organism & any
 %                          substrate (1x1)
 %           *tot.org_sa:   The amount of ECs matched for the organism & 
 %                          using s.a. (1x1)
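 %           *tot.rest_sa:  The amount of ECs matched for any organism & 
 %                          using s.a. (1x1)
 %           *tot.wc0-wc4:  The amount of ECs matched using 0, 1, 2, 3 or
 %                          4 wildcards in the EC number (1x1 each)
 %           *tot.matrix:   Number of matches per origin (rows 1-6) and
 %                          per number of wildcards 0-4 (columns 1-5) (6x5)
 %           forw and back additionally contain:
 %           *wcLevel:      Number of wildcards in the EC number that gave
 %                          the match, NaN where nothing was matched (mxn)
 % 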
 % Benjamin J. Sanchez. Last edited: 2016-03-01
 % Ivan Domenzain.      Last edited: 2018-01-16
 % Johan Gustafsson		Last edited: 2021-07-02 Introduced optimizations from GeckoLight
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 function kcats = matchKcats(model_data, org_name, minAcceptableKCat)
 %Entry point: loads the BRENDA and phylogenetic distance data, prepares the
 %lookup indices and then asks iterativeMatch for a forward and a backward
 %kcat for every non-empty entry of the model's EC number table.

 %Without a third argument no lower bound is enforced on the kcats:
 if nargin < 3
     minAcceptableKCat = 0;
 end
 %The bound is supplied in s^-1; convert it to per hour:
 minAcceptableKCat = 3600 * minAcceptableKCat;

 fprintf('Matching kcats...')

 %BRENDA kcat and specific activity tables:
 [KCATcell, SAcell] = loadBRENDAdata;

 %KEGG organism names and taxonomical distance matrix, plus the position
 %of the modelled organism in that list:
 phylDistStruct = KEGG_struct;
 org_index      = find_inKEGG(org_name,phylDistStruct.names);

 %Genus of every KEGG organism, i.e. the text before the first space, in
 %lower case so that later comparisons do not depend on case:
 nNames               = length(phylDistStruct.names);
 phylDistStruct.genus = cell(nNames,1);
 for n = 1:nNames
     fullName                = phylDistStruct.names{n};
     genusEnd                = strfind(fullName,' ') - 1;
     phylDistStruct.genus{n} = lower(fullName(1:genusEnd));
 end

 %Map genus -> position in the list of unique genera:
 phylDistStruct.uniqueGenusList    = unique(phylDistStruct.genus);
 nGenera                           = length(phylDistStruct.uniqueGenusList);
 phylDistStruct.genusHashMap       = containers.Map(phylDistStruct.uniqueGenusList,1:nGenera);
 phylDistStruct.uniqueGenusIndices = cell(nGenera,1);

 %For every unique genus, collect the positions of all names that have it:
 for n = 1:nNames
     slot = cell2mat(values(phylDistStruct.genusHashMap,phylDistStruct.genus(n)));
     phylDistStruct.uniqueGenusIndices{slot} = [phylDistStruct.uniqueGenusIndices{slot};n];
 end

 %Pieces of model_data used below:
 EC_numbers       = model_data.EC_numbers;
 MWs              = model_data.MWs;
 model            = model_data.model;
 substrates       = model_data.substrates;
 substrateIndices = model_data.substrateIndices;
 products         = model_data.products;
 productIndices   = model_data.productIndices;

 %Empty outputs; forward and backward direction start out identical:
 [mM,nM]  = size(EC_numbers);
 blank    = zeros(mM,nM);
 emptyDir = struct('kcats',blank,'org_s',blank,'rest_s',blank, ...
                   'org_ns',blank,'rest_ns',blank,'org_sa',blank, ...
                   'rest_sa',blank,'wcLevel',NaN(mM,nM));
 forw     = emptyDir;
 back     = emptyDir;
 tot      = struct('queries',0,'org_s',0,'rest_s',0,'org_ns',0, ...
                   'rest_ns',0,'org_sa',0,'rest_sa',0, ...
                   'wc0',0,'wc1',0,'wc2',0,'wc3',0,'wc4',0, ...
                   'matrix',zeros(6,5));

 %EC index: every distinct EC string of KCATcell{1} appears once in
 %ECIndexIds, and EcIndexIndices holds the rows of KCATcell where it occurs,
 %so the string comparison is done against the short list only.
 [ECIndexIds,~,ic] = unique(KCATcell{1});
 EcIndexIndices    = arrayfun(@(g) find(ic == g).', (1:length(ECIndexIds)).', ...
                              'UniformOutput',false);

 %Main loop over the entries of the EC number table:
 for i = 1:mM
     for j = 1:nM
         EC = EC_numbers{i,j};
         MW = MWs(i,j);
         %The first empty entry ends the row:
         if isempty(EC)
             break;
         end
         %One entry may hold several EC numbers separated by spaces:
         EC = strsplit(EC,' ');
         %Forward direction, only if the row has substrates:
         if ~isempty(substrates{i,1})
             [forw,tot] = iterativeMatch(EC,MW,substrates(i,:),i,j,KCATcell,...
                                         forw,tot,model,org_name,...
                                         phylDistStruct,org_index,SAcell,ECIndexIds,EcIndexIndices, substrateIndices(i,:), minAcceptableKCat);
         end
         %Backward direction, only if the row has products:
         if ~isempty(products{i,1})
             [back,tot] = iterativeMatch(EC,MW,products(i,:),i,j,KCATcell,...
                                         back,tot,model,org_name,...
                                         phylDistStruct,org_index,SAcell,ECIndexIds,EcIndexIndices, productIndices(i,:), minAcceptableKCat);
         end
     end
     %One progress dot per 100 rows and one for the last row:
     if i == mM || rem(i,100) == 0
         fprintf('.')
     end
 end

 kcats = struct('forw',forw,'back',back,'tot',tot);

 fprintf(' Done!\n')

 end
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 function [dir,tot] =iterativeMatch(EC,MW,subs,i,j,KCATcell,dir,tot,model,...
                                    name,phylDist,org_index,SAcell,ECIndexIds,EcIndexIndices, subsIndices, minAcceptableKCat)
 %Will iteratively try to match each EC number to some registry in BRENDA,
 %using each time one additional wildcard, and then stores the selected
 %result for entry (i,j) in dir and updates the counters in tot.
 %
 % EC    cell array with the EC number strings of entry (i,j)
 % dir   direction struct (forw or back) whose (i,j) elements are updated
 % tot   struct with the global match counters
 % All other inputs are forwarded unchanged to mainMatch.
 %
 % If none of the EC numbers can be matched, dir and tot are returned
 % unchanged.
 
 kcat    = zeros(size(EC));
 origin  = zeros(size(EC));
 matches = zeros(size(EC));
 %1000 marks "not matched"; it is larger than any real wildcard count:
 wc_num  = ones(size(EC)).*1000;
 for k = 1:length(EC)
     success  = false;
     while ~success
         %Attempt match:
         [kcat(k),origin(k),matches(k)] = mainMatch(EC{k},MW,subs,KCATcell,...
                                                    model,i,name,phylDist,...
                                                    org_index,SAcell,ECIndexIds,EcIndexIndices,subsIndices, minAcceptableKCat);
         wild_num = sum(EC{k}=='-');
         %If any match found, ends. If not, introduces one extra wild card and
         %tries again:
         if origin(k) > 0
             success   = true;
             wc_num(k) = wild_num;
         elseif wild_num >= 4
             %All four levels are already wildcards, so there is nothing
             %left to relax. Stop here and leave this EC number unmatched
             %(origin 0, wc_num 1000); continuing would index dot_pos(0).
             break
         else
             %Position 2 stands for the end of the two-character prefix in
             %front of the first level, the rest are the separating dots:
             dot_pos  = [2 strfind(EC{k},'.')];
             wc_text  = '-.-.-.-';
             EC{k}    = [EC{k}(1:dot_pos(4-wild_num)) wc_text(1:2*wild_num+1)];
         end
     end
 end
 
 if sum(origin) > 0
     %For more than one EC: Choose the maximum value among the ones with the
     %less amount of wildcards and the better origin.
     %NOTE(review): the second best_pos assignment selects every EC number
     %with the best origin, also those that needed more wildcards, and
     %max_pos searches the whole kcat vector. This looks broader than the
     %rule stated above - confirm the intended behaviour before changing it.
     best_pos   = (wc_num == min(wc_num));
     new_origin = origin(best_pos);
     best_pos   = (origin == min(new_origin(new_origin~=0)));
     max_pos    = find(kcat == max(kcat(best_pos)));
     wc_num     = wc_num(max_pos(1));
     origin     = origin(max_pos(1));
     matches    = matches(max_pos(1));
     kcat       = kcat(max_pos(1));
     
     %Update dir and tot. Exactly one of the (origin == x) terms is 1, so
     %the number of matches is stored in the field of the winning origin:
     dir.kcats(i,j)   = kcat;
     dir.org_s(i,j)   = matches*(origin == 1);
     dir.rest_s(i,j)  = matches*(origin == 2);
     dir.org_ns(i,j)  = matches*(origin == 3);
     dir.org_sa(i,j)  = matches*(origin == 4);
     dir.rest_ns(i,j) = matches*(origin == 5);    
     dir.rest_sa(i,j) = matches*(origin == 6);
     dir.wcLevel(i,j) = wc_num;
     tot.org_s        = tot.org_s   + (origin == 1);
     tot.rest_s       = tot.rest_s  + (origin == 2);
     tot.org_ns       = tot.org_ns  + (origin == 3);
     tot.org_sa       = tot.org_sa  + (origin == 4);
     tot.rest_ns      = tot.rest_ns + (origin == 5);    
     tot.rest_sa      = tot.rest_sa + (origin == 6);
     tot.wc0          = tot.wc0     + (wc_num == 0);
     tot.wc1          = tot.wc1     + (wc_num == 1);
     tot.wc2          = tot.wc2     + (wc_num == 2);
     tot.wc3          = tot.wc3     + (wc_num == 3);
     tot.wc4          = tot.wc4     + (wc_num == 4);
     tot.queries      = tot.queries + 1;
     %Rows are origins 1-6, columns are 0-4 wildcards:
     tot.matrix(origin,wc_num+1) = tot.matrix(origin,wc_num+1) + 1;
 end
 
 end
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
 function [kcat,origin,matches] = mainMatch(EC,MW,subs,KCATcell,model,i,...
                                      name,phylDist,org_index,SAcell,ECIndexIds,EcIndexIndices,subsIndices, minAcceptableKCat)
 %Tries six increasingly relaxed searches for one EC number and stops at the
 %first one that gives at least one match. origin reports which search
 %succeeded (0 if none did):
 %   1: model organism, matching substrate
 %   2: any (closest) organism, matching substrate
 %   3: model organism, any substrate
 %   4: model organism, specific activity
 %   5: any (closest) organism, any substrate
 %   6: any (closest) organism, specific activity
 %kcat and matches are the outputs of the last matchKcat call made.

 %If the EC number holds wild cards only the part in front of the first one
 %is compared, as a relaxed match:
 dashAt = strfind(EC,'-');
 wild   = ~isempty(dashAt);
 if wild
     EC = EC(1:dashAt(1)-1);
 end

 %The string comparison against the kcat table is expensive, so it is done
 %once here and reused by all searches:
 kcatHits = extract_string_matches(EC,KCATcell{1},wild,ECIndexIds,EcIndexIndices);

 %Search settings in order of priority: {organism, substrate, SA}
 searchPlan = {name, true,  false; ...
               '',   true,  false; ...
               name, false, false; ...
               name, false, true;  ...
               '',   false, false; ...
               '',   false, true};

 origin = 0;
 saHits = [];
 for level = 1:size(searchPlan,1)
     if level == 4
         %The specific activity table is first needed here, so its string
         %comparison is postponed until now:
         saHits = extract_string_matches(EC,SAcell{1},wild,[],[]);
     end
     [kcat,matches] = matchKcat(EC,MW,subs,KCATcell,searchPlan{level,1},...
                                searchPlan{level,2},searchPlan{level,3},...
                                model,i,phylDist,org_index,SAcell,...
                                kcatHits,saHits,subsIndices, minAcceptableKCat);
     if matches > 0
         origin = level;
         return
     end
 end
 end
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 function [kcat,matches] = matchKcat(EC,MW,subs,KCATcell,organism,...
                                     substrate,SA,model,i,phylDist,...
                                     org_index,SAcell,KCATcellMatches,SAcellMatches,subsIndices,minAcceptableKCat)
 %Collects every BRENDA value that satisfies the requested search and
 %returns the largest one together with the number of values found.
 %
 % organism   organism name to require, or '' to accept the closest one
 % substrate  true to only accept entries for one of the substrates in subs
 % SA         true to read the specific activity table (SAcell) instead of
 %            the kcat table (KCATcell)
 % KCATcellMatches / SAcellMatches
 %            rows of the respective table whose EC number already matched
 %
 % kcat is 0 and matches is 0 when nothing was found.
 kcat           = [];
 matches        = 0;
 diffusionLimit = 1E7*3600;

 if SA
     hits = extract_indexes(SAcellMatches,[],SAcell{2},subs,substrate,...
                            organism,org_index,phylDist);

     kcat      = SAcell{3}(hits);
     %The next two are only read by the molecular weight correction, which
     %is currently disabled (see below):
     org_cell  = SAcell{2}(hits);
     MW_BRENDA = SAcell{4}(hits);

     %Very low values would dominate the modeling, so anything below the
     %accepted minimum is raised to it:
     kcat(kcat < minAcceptableKCat) = minAcceptableKCat;
 else
     hits = extract_indexes(KCATcellMatches,KCATcell{2},KCATcell{3},...
                            subs,substrate,organism,org_index,...
                            phylDist);
     if ~substrate
         kcat = KCATcell{4}(hits);
         kcat(kcat < minAcceptableKCat) = minAcceptableKCat;
     else
         for h = 1:length(hits)
             row = hits(h);
             for s = 1:length(subs)
                 %The first empty name ends the list of substrates:
                 if isempty(subs{s})
                     break;
                 end
                 metRow = subsIndices(s);
                 if strcmpi(subs{s},KCATcell{2}(row)) && KCATcell{4}(row) > 0
                     coeff = min(abs(model.S(metRow,i)));
                     value = KCATcell{4}(row);
                     %The minimum is applied before the value is divided by
                     %the stoichiometric coefficient:
                     if value < minAcceptableKCat
                         value = minAcceptableKCat;
                     end
                     kcat = [kcat;value/coeff];
                 end
             end
         end
     end
 end

 %Return maximum value:
 if isempty(kcat)
     kcat = 0;
 else
     matches = length(kcat);
     kcat    = max(kcat);
     %Disabled: for a specific activity value of the model's own organism
     %the result used to be rescaled with MW/MW_BRENDA of the maximum entry.
 end

 %Avoid SA*Mw values over the diffusion limit rate  [Bar-Even et al. 2011]
 if kcat > diffusionLimit
     kcat = diffusionLimit;
 end
 end
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %Make the string matches of the ECs. This is heavy, so only do it once! 
 %
 function EC_indexes = extract_string_matches(EC,EC_cell,wild,ECIndexIds,EcIndexIndices)
 %Returns, as a row vector, the positions in EC_cell whose EC number
 %matches EC.
 %
 % EC              EC number to look for; with wild = true it is the part
 %                 in front of the first wild card
 % EC_cell         cell array with the EC number of every table entry
 % wild            true:  an entry matches if it starts with EC
 %                 false: an entry matches if it equals EC (ignoring case)
 % ECIndexIds      [optional, may be empty] the unique strings of EC_cell
 % EcIndexIndices  [optional, may be empty] for each string in ECIndexIds,
 %                 a row vector with its positions in EC_cell
 %
 % When ECIndexIds is given the comparison runs on the unique strings only
 % and the positions are looked up in EcIndexIndices; otherwise EC_cell is
 % scanned directly. Both routes use the same matching rule.
 EC_indexes = [];
 hasIndex   = ~isempty(ECIndexIds);

 if wild
     %A wild card EC number stands for everything below a given level, so
     %it has to match at the start of the string, not anywhere inside it.
     if hasIndex
         hitIds     = find(startsWith(ECIndexIds, EC));
         EC_indexes = [EcIndexIndices{hitIds}];
     elseif ~isempty(EC_cell)
         EC_indexes = reshape(find(startsWith(EC_cell, EC)),1,[]);
     end
 else
     if hasIndex
         hitIds     = find(strcmpi(EC,ECIndexIds));
         %strcmpi may hit several unique strings that only differ in case;
         %collect the positions of all of them.
         EC_indexes = [EcIndexIndices{hitIds}];
     elseif ~isempty(EC_cell)
         EC_indexes = transpose(find(strcmpi(EC,EC_cell)));
     end
 end

 end
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %Extract the indexes of the entries in the BRENDA data that meet the 
 %conditions specified by the search criteria
 function EC_indexes = extract_indexes(EC_indCellStringMatches,subs_cell,orgs_cell,subs,...
                                       substrate,organism, org_index,...
                                       phylDist)
 %Narrows the table rows whose EC number already matched down to those that
 %also meet the substrate and organism criteria of the search.
 %
 % EC_indCellStringMatches  rows of the table whose EC number matched
 % subs_cell                substrate name of every table row (only read
 %                          when substrate is true)
 % orgs_cell                organism name of every table row
 % subs                     substrate names of the reaction; the first
 %                          empty element ends the list
 % substrate                true to keep only rows for one of subs
 % organism                 organism to require, or '' for the closest one
 % org_index                position of the model organism in
 %                          phylDist.names, or the char '*' if it has none
 % phylDist                 struct with names, distMat, genusHashMap and
 %                          uniqueGenusIndices
 
 EC_indexes = EC_indCellStringMatches;%reuse so the string comparisons are not run many times
 
 %If substrate=true then it will extract only the substrates appearances 
 %indexes in the EC subset from the BRENDA cell array
 if substrate
     if (~isempty(EC_indexes)) %optimization
         Subs_indexes = [];
         for l = 1:length(subs)
             if (isempty(subs{l}))
                 break;
             end
             Subs_indexes = horzcat(Subs_indexes,EC_indexes(strcmpi(subs(l),...
                                    subs_cell(EC_indexes))));          
         end
         EC_indexes = Subs_indexes;    
     end
 end
 
 EC_orgs = orgs_cell(EC_indexes);
 %If specific organism values are requested looks for all the organism
 %repetitions on the subset BRENDA cell array(EC_indexes)
 if string(organism) ~= ''  
     EC_indexes = EC_indexes(strcmpi(string(organism),EC_orgs));
 
 %If KEGG code was assigned to the organism (model) then it will look for   
 %the Kcat value for the closest organism.
 %The test is on the type and not on the value: '*' has character code 42,
 %so comparing with ~= '*' would also reject the organism at position 42.
 elseif isnumeric(org_index)
     KEGG_indexes = [];temp = [];
     
     %For relating a phyl dist between the modelled organism and the organisms
     %on the BRENDA cell array it should search for a KEGG code for each of 
     %these 
     for j=1:length(EC_indexes)
         %Assigns a KEGG index for those found on the KEGG struct
         orgs_index = find(strcmpi(orgs_cell(EC_indexes(j)),phylDist.names),1);
         if ~isempty(orgs_index)
             KEGG_indexes = [KEGG_indexes; orgs_index];
             temp         = [temp;EC_indexes(j)];
         %For values related to organisms without KEGG code, then it will
         %look for KEGG code for the first organism with the same genus
         else 
             org = orgs_cell{EC_indexes(j)};
             orgGenus = lower(org(1:(strfind(org,' ')-1)));
             if isKey(phylDist.genusHashMap,orgGenus) %values() fails for unknown keys
                 matchInd = cell2mat(values(phylDist.genusHashMap,{orgGenus}));
                 sameGenus    = phylDist.uniqueGenusIndices{matchInd};
                 KEGG_indexes = [KEGG_indexes;sameGenus(1)];
                 temp         = [temp;EC_indexes(j)];
             end
             %Rows whose organism and genus are both unknown are dropped.
         end
     end
     %Update the EC_indexes cell array
     EC_indexes = temp;
     %Looks for the taxonomically closest organism and saves the index of
     %its appearences in the BRENDA cell
     if ~isempty(EC_indexes)
         distances = phylDist.distMat(org_index,KEGG_indexes);
         EC_indexes = EC_indexes(distances == min(distances));
     end
 end
 end 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 function org_index = find_inKEGG(org_name,names)
     %Returns the position of org_name in names (comparison ignores case).
     %If the full name is not present, the first entry with the same genus
     %(the text in front of the first space) is used instead. If that also
     %fails the char '*' is returned.
     %
     % org_name  organism name
     % names     cell array with the organism names to search in

     %Only the first hit is kept, so that the result is always a single
     %position even when names contains the same organism more than once:
     org_index = find(strcmpi(org_name,names),1);
     if isempty(org_index)
         orgGenus = org_name(1:strfind(org_name,' ')-1);
         %Every name is checked, including the last one:
         for n = 1:length(names)
             str = names{n};
             if strcmpi(orgGenus,str(1:strfind(str,' ')-1))
                 org_index = n;
                 break
             end
         end
         if isempty(org_index)
             org_index = '*';
         end
     end
 end
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 function phylDistStruct =  KEGG_struct
     %Reads phylDistStruct from the PhylDist database file, turns its ids
     %and names around with transpose and removes from every name whatever
     %follows the first ' (' (the space and the bracket included).
     %The path is relative to the current folder.
     stored         = load('../../databases/PhylDist.mat');
     phylDistStruct = stored.phylDistStruct;

     phylDistStruct.ids   = transpose(phylDistStruct.ids);
     phylDistStruct.names = transpose(phylDistStruct.names);

     for n = 1:length(phylDistStruct.names)
         label    = phylDistStruct.names{n};
         parenPos = strfind(label, ' (');
         if ~isempty(parenPos)
             %Keep only the text in front of the first ' (':
             phylDistStruct.names{n} = label(1:parenPos-1);
         end
     end
 end