diff --git a/alignment/alignment.cpp b/alignment/alignment.cpp index 7e8fb8313..3ff67c1a3 100644 --- a/alignment/alignment.cpp +++ b/alignment/alignment.cpp @@ -1250,44 +1250,94 @@ void Alignment::orderPatternByNumChars(int pat_type) { void Alignment::ungroupSitePattern() { + // assign an individual pattern to each site vector stored_pat = (*this); clear(); + IntVector stored_site_pattern = site_pattern; for (size_t i = 0; i < getNSite(); ++i) { Pattern pat = stored_pat[getPatternID(i)]; pat.frequency = 1; push_back(pat); site_pattern[i] = i; } + ASSERT(getNPattern() == getNSite()); pattern_index.clear(); + // update the pattern_first_site map + pattern_first_site = site_pattern; + // refill the existing pattern-specific parameters + if (isSSF()) { + vector stored_ptn_state_freq = ptn_state_freq; + ptn_state_freq.clear(); + for (size_t ptn = 0; ptn < getNPattern(); ++ptn) { + ptn_state_freq.push_back(stored_ptn_state_freq[stored_site_pattern[ptn]]); + } + } + if (isSSR()) { + vector stored_ptn_rate_scaler = ptn_rate_scaler; + ptn_rate_scaler.clear(); + for (size_t ptn = 0; ptn < getNPattern(); ++ptn) { + ptn_rate_scaler.push_back(stored_ptn_rate_scaler[stored_site_pattern[ptn]]); + } + } } -void Alignment::regroupSitePattern(int groups, IntVector& site_group) +void Alignment::regroupSitePattern(int groups, const IntVector &site_group) { + // subdivide patterns based on their assignment to the provided groups vector stored_pat = (*this); - IntVector stored_site_pattern = site_pattern; clear(); + IntVector stored_site_pattern = site_pattern; site_pattern.clear(); site_pattern.resize(stored_site_pattern.size(), -1); - size_t count = 0; - for (int g = 0; g < groups; g++) { + size_t cnt = 0; + for (int g = 0; g < groups; ++g) { pattern_index.clear(); - for (size_t i = 0; i < site_group.size(); ++i) - if (site_group[i] == g) { - count++; - Pattern pat = stored_pat[stored_site_pattern[i]]; - addPattern(pat, i); + for (size_t i = 0; i < getNSite(); ++i) { + if (site_group[i] == g) { + cnt ++; + Pattern pat = stored_pat[stored_site_pattern[i]]; + addPattern(pat, i); + } } } - ASSERT(count == stored_site_pattern.size()); - count = 0; - for (iterator it = begin(); it != end(); ++it) - count += it->frequency; - ASSERT(count == getNSite()); + ASSERT(cnt == getNSite()); + // check pattern frequencies of the new patterns + cnt = 0; + for (iterator it = begin(); it != end(); ++it) { + cnt += it->frequency; + } + ASSERT(cnt == getNSite()); pattern_index.clear(); //printPhylip("/dev/stdout"); + // update the pattern_first_site map + pattern_first_site = IntVector(getNPattern(), -1); + for (size_t i = 0; i < getNSite(); ++i) { + if (pattern_first_site[site_pattern[i]] == -1) { + pattern_first_site[site_pattern[i]] = i; + } + } + // refill the existing pattern-specific parameters + IntVector new_to_old_pattern; + for (size_t ptn = 0; ptn < getNPattern(); ++ptn) { + int i = pattern_first_site[ptn]; + new_to_old_pattern.push_back(stored_site_pattern[i]); + } + if (isSSF()) { + vector stored_ptn_state_freq = ptn_state_freq; + ptn_state_freq.clear(); + for (size_t ptn = 0; ptn < getNPattern(); ++ptn) { + ptn_state_freq.push_back(stored_ptn_state_freq[new_to_old_pattern[ptn]]); + } + } + if (isSSR()) { + vector stored_ptn_rate_scaler = ptn_rate_scaler; + ptn_rate_scaler.clear(); + for (size_t ptn = 0; ptn < getNPattern(); ++ptn) { + ptn_rate_scaler.push_back(stored_ptn_rate_scaler[new_to_old_pattern[ptn]]); + } + } } - /** detect the data type of the input sequences @param sequences vector of strings @@ -3680,12 +3730,17 @@ void Alignment::createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq pattern_freq->resize(aln->getNPattern(), 0); } - if (!aln->site_state_freq.empty()) { - // resampling also the per-site state frequency vector - if (aln->site_state_freq.size() != aln->getNPattern() || spec) + if (aln->isSSF()) { + // resampling also the per-site state frequency vectors + if (aln->ptn_state_freq.size() != aln->getNPattern() || spec) outError("Unsupported bootstrap feature, pls contact the developers"); } - + if (aln->isSSR()) { + // resampling also the per-site rate scalers + if (aln->ptn_rate_scaler.size() != aln->getNPattern() || spec) + outError("Unsupported bootstrap feature, pls contact the developers"); + } + if (Params::getInstance().jackknife_prop > 0.0 && spec) { outError((string)"Unsupported jackknife with sampling " + spec); } @@ -3702,11 +3757,16 @@ void Alignment::createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq Pattern pat = aln->at(ptn_id); int nptn = getNPattern(); addPattern(pat, added_sites); - if (!aln->site_state_freq.empty() && getNPattern() > nptn) { - // a new pattern is added, copy state frequency vector - double *state_freq = new double[num_states]; - memcpy(state_freq, aln->site_state_freq[ptn_id], num_states*sizeof(double)); - site_state_freq.push_back(state_freq); + if (aln->isSSF() && getNPattern() > nptn) { + // a new pattern is added, copy its state frequency vector + double *state_freqs = new double[num_states]; + memcpy(state_freqs, aln->ptn_state_freq[ptn_id], num_states*sizeof(double)); + ptn_state_freq.push_back(state_freqs); + } + if (aln->isSSR() && getNPattern() > nptn) { + // a new pattern is added, copy its rate scaler + double rate_scaler = aln->ptn_rate_scaler[ptn_id]; + ptn_rate_scaler.push_back(rate_scaler); } if (pattern_freq) ((*pattern_freq)[ptn_id])++; added_sites++; @@ -3783,10 +3843,8 @@ void Alignment::createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq out_site += site_vec[part+1]; } } - if (!aln->site_state_freq.empty()) { - site_model = site_pattern; - ASSERT(site_state_freq.size() == getNPattern()); - } + if (aln->isSSF()) ASSERT(ptn_state_freq.size() == getNPattern()); + if (aln->isSSR()) ASSERT(ptn_rate_scaler.size() == getNPattern()); verbose_mode = save_mode; countConstSite(); // buildSeqStates(); @@ -3940,11 +3998,8 @@ void Alignment::buildFromPatternFreq(Alignment & aln, IntVector new_pattern_freq site_pattern[site++] = size()-1; } } - if (!aln.site_state_freq.empty()) { - site_model = site_pattern; - ASSERT(site_state_freq.size() == getNPattern()); - } - + if (aln.isSSF()) ASSERT(ptn_state_freq.size() == getNPattern()); + if (aln.isSSR()) ASSERT(ptn_rate_scaler.size() == getNPattern()); countConstSite(); // buildSeqStates(); // checkSeqName(); @@ -4221,11 +4276,11 @@ Alignment::~Alignment() non_stop_codon = nullptr; delete [] pars_lower_bound; pars_lower_bound = nullptr; - for (auto it = site_state_freq.rbegin(); it != site_state_freq.rend(); ++it) { - delete [] (*it); + for (auto rit = ptn_state_freq.rbegin(); rit != ptn_state_freq.rend(); ++rit) { + delete [] (*rit); } - site_state_freq.clear(); - site_model.clear(); + ptn_state_freq.clear(); + ptn_rate_scaler.clear(); } double Alignment::computeObsDist(int seq1, int seq2) { @@ -5617,92 +5672,111 @@ double Alignment::multinomialProb (IntVector &pattern_freq) return (fac - sumFac + sumProb); } -bool Alignment::readSiteStateFreq(const char* site_freq_file) +bool Alignment::readSiteParamFile(const char* site_param_file, const string ¶m_type) { - cout << endl << "Reading site-specific state frequency file " << site_freq_file << " ..." << endl; - site_model.resize(getNSite(), -1); - IntVector pattern_to_site; // vector from pattern to the first site - pattern_to_site.resize(getNPattern(), -1); - for (size_t i = 0; i < getNSite(); ++i) { - if (pattern_to_site[getPatternID(i)] == -1) { - pattern_to_site[getPatternID(i)] = i; - } - } - bool aln_changed = false; - + ASSERT((param_type == "freq" && ptn_state_freq.empty()) || + (param_type == "rate" && ptn_rate_scaler.empty())); + string msg = (param_type == "freq") ? "state frequency" : "rate"; + cout << endl << "Reading site-specific " << msg << " file " << site_param_file << " ..." << endl; + bool aln_changed = false; + int specified_sites = 0; + IntVector site_model(getNSite(), -1); // map each site to a model + vector models; // site-specific frequency vectors or rate scalers + // fill the pattern_first_site map + pattern_first_site = IntVector(getNPattern(), -1); + for (size_t i = 0; i < getNSite(); ++i) { + if (pattern_first_site[site_pattern[i]] == -1) { + pattern_first_site[site_pattern[i]] = i; + } + } + // read the input file try { - ifstream in; - in.exceptions(ios::failbit | ios::badbit); - in.open(site_freq_file); - double freq; + ifstream in; + in.exceptions(ios::failbit | ios::badbit); + in.open(site_param_file); + in.exceptions(ios::badbit); + int prev_site = -1; + while (true) { string site_spec; - int specified_sites = 0; - in.exceptions(ios::badbit); - for (int model_id = 0; !in.eof(); model_id++) { - // remove the failbit - in >> site_spec; - if (in.eof()) break; - IntVector site_id; - extractSiteID(this, site_spec.c_str(), site_id); - specified_sites += site_id.size(); - if (site_id.size() == 0) throw "No site ID specified"; - for (IntVector::iterator it = site_id.begin(); it != site_id.end(); it++) { - if (site_model[*it] != -1) throw "Duplicated site ID"; - site_model[*it] = site_state_freq.size(); - } - double *site_freq_entry = new double[num_states]; - double sum = 0; - for (int i = 0; i < num_states; ++i) { + // remove the failbit + in >> site_spec; + if (in.eof()) break; + // handle the line site ids + IntVector site_ids; // sites specified in a single line + extractSiteID(this, site_spec.c_str(), site_ids); + if (site_ids.size() == 0) throw "No site ID specified"; + if (site_ids.size() > 1) throw "Muptiple site IDs in a single line"; + int site = site_ids[0]; + if (prev_site == site) throw "Duplicated site ID"; + if (prev_site > site) throw "Wrong order of sites"; + prev_site = site; + specified_sites ++; + ASSERT(site_model[site] == -1); + site_model[site] = models.size(); + // handle the line params + double *site_param_entry; + if (param_type == "freq") { + double freq; + double *state_freqs = new double[num_states]; + double sum = 0.0; + for (int x = 0; x < num_states; ++x) { in >> freq; if (freq <= 0.0 || freq >= 1.0) throw "Frequencies must be strictly positive and smaller than 1"; - site_freq_entry[i] = freq; + state_freqs[x] = freq; sum += freq; } - if (fabs(sum-1.0) > 1e-4) { - if (fabs(sum-1.0) > 1e-3) - outWarning("Frequencies of site " + site_spec + " do not sum up to 1 and will be normalized"); - sum = 1.0/sum; - for (int i = 0; i < num_states; ++i) - site_freq_entry[i] *= sum; - } - convfreq(site_freq_entry); // regularize frequencies (eg if some freq = 0) - - // 2016-02-01: now check for equality of sites with same site-pattern and same freq - int prev_site = pattern_to_site[getPatternID(site_id[0])]; - if (site_id.size() == 1 && prev_site < site_id[0] && site_model[prev_site] != -1) { - // compare freq with prev_site - bool matched_freq = true; - double *prev_freq = site_state_freq[site_model[prev_site]]; - for (int i = 0; i < num_states; ++i) { - if (site_freq_entry[i] != prev_freq[i]) { - matched_freq = false; - break; - } - } - if (matched_freq) { - site_model[site_id[0]] = site_model[prev_site]; - } else - aln_changed = true; - } - - if (site_model[site_id[0]] == site_state_freq.size()) - site_state_freq.push_back(site_freq_entry); - else - delete [] site_freq_entry; + if (fabs(sum - 1.0) > 1e-4) { + outWarning("Frequencies of site " + site_spec + " do not sum up to 1 and will be normalized"); + for (int x = 0; x < num_states; ++x) { + state_freqs[x] /= sum; + } + } + convfreq(state_freqs); // regularize freqs (if some freqs are too close to 0) + site_param_entry = state_freqs; + } else { + double rate; + double *rate_scaler = new double; + in >> rate; + if (rate < 0.0 || rate > 100.0) throw "Rates must be non-negative and not higher than 100"; + rate = max(rate, MIN_SITE_RATE); // regularize rate (if it is too close to 0) + *rate_scaler = rate; + site_param_entry = rate_scaler; } - if (specified_sites < site_model.size()) { - aln_changed = true; - // there are some unspecified sites - cout << site_model.size() - specified_sites << " unspecified sites will get default frequencies" << endl; - for (size_t i = 0; i < site_model.size(); ++i) - if (site_model[i] == -1) - site_model[i] = site_state_freq.size(); - site_state_freq.push_back(NULL); + // add the model only if it is its first occurence for the pattern of the current site + int first_site = pattern_first_site[site_pattern[site]]; + if (first_site < site && site_model[first_site] != -1) { + // compare the site param with the first_site param + bool matched_param = true; + double *first_site_param_entry = models[site_model[first_site]]; + if (param_type == "freq") { + for (int x = 0; x < num_states; ++x) { + if (site_param_entry[x] != first_site_param_entry[x]) { + matched_param = false; + break; + } + } + } else { + if (*site_param_entry != *first_site_param_entry) { + matched_param = false; + } + } + if (matched_param) { + // the only case when we do not add the model + site_model[site] = site_model[first_site]; + } else { + aln_changed = true; + } + } // else: the current site is effectively the first one of its pattern + if (site_model[site] == models.size()) { + models.push_back(site_param_entry); + } else { + delete [] site_param_entry; } - in.clear(); - // set the failbit again - in.exceptions(ios::failbit | ios::badbit); - in.close(); + } + in.clear(); + // set the failbit again + in.exceptions(ios::failbit | ios::badbit); + in.close(); } catch (const char* str) { outError(str); } catch (string str) { @@ -5710,12 +5784,91 @@ bool Alignment::readSiteStateFreq(const char* site_freq_file) } catch(ios::failure) { outError(ERR_READ_INPUT); } - if (aln_changed) { - cout << "Regrouping alignment sites..." << endl; - regroupSitePattern(site_state_freq.size(), site_model); - } - cout << site_state_freq.size() << " distinct per-site state frequency vectors detected" << endl; - return aln_changed; + // check for unspecified sites + if (specified_sites < getNSite()) { + aln_changed = true; + msg = (param_type == "freq") ? "frequencies" : "rates"; + cout << site_model.size() - specified_sites << " unspecified sites will get default " << msg << endl; + for (size_t site = 0; site < getNSite(); ++site) { + if (site_model[site] == -1) { + site_model[site] = models.size(); + } + } + double *default_param_entry; + if (param_type == "freq") default_param_entry = NULL; else *default_param_entry = -1.0; + models.push_back(default_param_entry); + } + // if needed, subdivide patterns so that sites in each new pattern have same freqs and rates + if (aln_changed) { + cout << "Regrouping alignment sites..." << endl; + regroupSitePattern(models.size(), site_model); + } + // fill the selected pattern-specific parameter with the contents of the models + for (size_t ptn = 0; ptn < getNPattern(); ++ptn) { + int first_site = pattern_first_site[ptn]; + if (param_type == "freq") { + ptn_state_freq.push_back(models[site_model[first_site]]); + } else { + ptn_rate_scaler.push_back(*(models[site_model[first_site]])); + } + } + msg = (param_type == "freq") ? "state frequency vectors" : "rate scalers"; + cout << models.size() << " distinct per-site " << msg << " detected" << endl; + return aln_changed; +} + +double Alignment::normalizePtnRateScaler() +{ + // the goal is to end up with mean rate == 1.0, + // some rates may become > MAX_SITE_RATE, but + // all rates must stay >= MIN_SITE_RATE + ASSERT(ptn_rate_scaler.size()); + int ptnf; + double rate; + size_t nptn = getNPattern(); + // calculate the mean rate value + size_t cnt = 0; + double sum = 0.0; + for (size_t ptn = 0; ptn < nptn; ++ptn) { + ptnf = at(ptn).frequency; + rate = ptn_rate_scaler[ptn]; + if (rate != -1.0) { + cnt += ptnf; + sum += rate * ptnf; + } + } + double mean = sum / cnt; + if (fabs(mean - 1.0) <= 1e-4) return 1.0; + // normalization: divide rates by their mean value + bool regularize = false; + size_t cnt_fast = 0; + double sum_slow = 0.0, sum_fast = 0.0; + for (size_t ptn = 0; ptn < nptn; ++ptn) { + ptnf = at(ptn).frequency; + rate = ptn_rate_scaler[ptn]; + rate = (rate != -1.0) ? (rate / mean) : 1.0; + ptn_rate_scaler[ptn] = rate; + // data for regularization + if (rate > 1.0) { + cnt_fast += ptnf; + sum_fast += rate * ptnf; + } else if (rate < MIN_SITE_RATE) { + regularize = true; + sum_slow += MIN_SITE_RATE * ptnf; + } else { + sum_slow += rate * ptnf; + } + } + if (!regularize) return mean; + // regularization: bound min rates and scale down the above-one parts of the fast rates + double coef = (getNSite() - cnt_fast - sum_slow) / (sum_fast - cnt_fast); + for (size_t ptn = 0; ptn < nptn; ++ptn) { + rate = ptn_rate_scaler[ptn]; + if (rate < MIN_SITE_RATE) rate = MIN_SITE_RATE; + if (rate > 1.0) rate = rate * coef + 1.0 - coef; + ptn_rate_scaler[ptn] = rate; + } + return mean; } /** diff --git a/alignment/alignment.h b/alignment/alignment.h index 934591a80..ee02ce481 100644 --- a/alignment/alignment.h +++ b/alignment/alignment.h @@ -1,7 +1,7 @@ // // C++ Interface: alignment // -// Description: +// Description: // // // Author: BUI Quang Minh, Steffen Klaere, Arndt von Haeseler , (C) 2008 @@ -319,21 +319,20 @@ class Alignment : public vector, public CharSet, public StateSpace { virtual void orderPatternByNumChars(int pat_type); /** - * un-group site-patterns, i.e., making #sites = #patterns and pattern frequency = 1 for all patterns + * un-group site-patterns, i.e. making #sites = #patterns and pattern frequency = 1 for all patterns */ void ungroupSitePattern(); - /** - * re-group site-patterns + * re-group site-patterns so that sites within each pattern fall into the same group * @param groups number of groups * @param site_group group ID (0, 1, ...ngroups-1; must be continuous) of all sites */ - void regroupSitePattern(int groups, IntVector &site_group); + void regroupSitePattern(int groups, const IntVector &site_group); /**************************************************************************** - output alignment + output alignment ****************************************************************************/ SeqType detectSequenceType(StrVector &sequences); @@ -535,9 +534,11 @@ class Alignment : public vector, public CharSet, public StateSpace { */ bool isGapOnlySeq(size_t seq_id); - virtual bool isSuperAlignment() { - return false; - } + bool isSSF() { return !ptn_state_freq.empty(); } + + bool isSSR() { return !ptn_rate_scaler.empty(); } + + virtual bool isSuperAlignment() { return false; } /**************************************************************************** alignment general processing @@ -916,13 +917,13 @@ class Alignment : public vector, public CharSet, public StateSpace { vector pomo_sampled_states; IntIntMap pomo_sampled_states_index; // indexing, to quickly find if a PoMo-2-state is already present - /* for site-specific state frequency model with Huaichun, Edward, Andrew */ - - /* site to model ID map */ - IntVector site_model; - - /** site to state frequency vector */ - vector site_state_freq; + /* for site-specific models with Huaichun, Edward, Andrew */ + + /** pattern index to state frequency vector map */ + vector ptn_state_freq; + + /** pattern index to rate scaler map */ + vector ptn_rate_scaler; /** * @return true if data type is SEQ_CODON and state is a stop codon @@ -985,14 +986,20 @@ class Alignment : public vector, public CharSet, public StateSpace { void getAppearance(StateType state, StateBitset &state_app); /** - * read site specific state frequency vectors from a file to create corresponding model - * update site_model and site_state_freq variables for this class - * @param aln input alignment - * @param site_freq_file file name - * @return TRUE if alignment needs to be changed, FALSE otherwise + * read site-specific state frequency vectors or site-specific rate (branch length) scalers + * from a file to create a site-specific model + * @param site_param_file input file name + * @param param_type should be set to "freq" or "rate" + * @return TRUE if alignment patterns need to be changed, FALSE otherwise */ - bool readSiteStateFreq(const char* site_freq_file); - + bool readSiteParamFile(const char* site_param_file, const string ¶m_type); + + /** + * normalize site-specific rates by their mean value + * @return mean rate before normalization + */ + double normalizePtnRateScaler(); + /** * special initialization for codon sequences, e.g., setting #states, genetic_code * @param sequence_type user-defined sequence type @@ -1011,32 +1018,23 @@ class Alignment : public vector, public CharSet, public StateSpace { void extractMapleFile(const std::string& aln_name, const InputType& format); protected: + /** sequence names */ + vector seq_names; + /** expected num_sites */ + int expected_num_sites = -1; - /** - sequence names - */ - vector seq_names; - - /** - expected num_sites - */ - int expected_num_sites = -1; + /** site to pattern index map */ + IntVector site_pattern; - /** - Site to pattern index - */ - IntVector site_pattern; + /** pattern index to first pattern site map */ + IntVector pattern_first_site; - /** - hash map from pattern to index in the vector of patterns (the alignment) - */ - PatternIntMap pattern_index; - - /** - alisim: caching ntfreq if it has already randomly initialized - */ - double* cache_ntfreq = NULL; + /** hash map from pattern to index in the vector of patterns (the alignment) */ + PatternIntMap pattern_index; + + /** alisim: caching ntfreq if it has already randomly initialized */ + double* cache_ntfreq = NULL; private: /** diff --git a/alignment/alignmentpairwise.cpp b/alignment/alignmentpairwise.cpp index 8b39bfeac..8adf30764 100644 --- a/alignment/alignmentpairwise.cpp +++ b/alignment/alignmentpairwise.cpp @@ -202,7 +202,6 @@ double AlignmentPairwise::computeFunction(double value) { auto sequence2 = tree->getConvertedSequenceByNumber(seq_id2); auto frequencies = tree->getConvertedSequenceFrequencies(); size_t sequenceLength = tree->getConvertedSequenceLength(); - if (site_rate->isSiteSpecificRate()) { for (int i = 0; i < sequenceLength; i++) { int state1 = sequence1[i]; @@ -221,7 +220,7 @@ double AlignmentPairwise::computeFunction(double value) { if (state1 >= num_states || state2 >= num_states) { continue; } - double trans = tree->getModelFactory()->computeTrans(value * site_rate->getPtnRate(i), state1, state2); + double trans = tree->getModel()->computeTrans(value, model->getPtnModelID(i), state1, state2); lh -= log(trans) * frequencies[i]; } return lh; @@ -358,10 +357,10 @@ void AlignmentPairwise::computeFuncDerv(double value, double &df, double &ddf) { continue; } double freq = frequencies[i]; - double rate_val = site_rate->getPtnRate(i); + double rate_val = 1.0; double rate_sqr = rate_val * rate_val; double derv1, derv2; - double trans = tree->getModel()->computeTrans(value * rate_val,model->getPtnModelID(i), state1, state2, derv1, derv2); + double trans = tree->getModel()->computeTrans(value * rate_val, model->getPtnModelID(i), state1, state2, derv1, derv2); double d1 = derv1 / trans; df -= rate_val * d1 * freq; ddf -= rate_sqr * (derv2/trans - d1*d1) * freq; @@ -376,10 +375,10 @@ void AlignmentPairwise::computeFuncDerv(double value, double &df, double &ddf) { if (num_states<=state2) { continue; } - double rate_val = site_rate->getPtnRate(i); + double rate_val = 1.0; double rate_sqr = rate_val * rate_val; double derv1, derv2; - double trans = tree->getModel()->computeTrans(value * rate_val,model->getPtnModelID(i), state1, state2, derv1, derv2); + double trans = tree->getModel()->computeTrans(value * rate_val, model->getPtnModelID(i), state1, state2, derv1, derv2); double d1 = derv1 / trans; double freq = tree->aln->at(i).frequency; df -= rate_val * d1 * freq; diff --git a/alignment/superalignment.cpp b/alignment/superalignment.cpp index 35be87eb2..038d2e1df 100644 --- a/alignment/superalignment.cpp +++ b/alignment/superalignment.cpp @@ -51,13 +51,13 @@ SuperAlignment::SuperAlignment() : Alignment() { SuperAlignment::SuperAlignment(Params ¶ms) : Alignment() { readFromParams(params); - + init(); - + // only show Degree of missing data if AliSim is inactive or an input alignment is supplied if (!(Params::getInstance().alisim_active && !Params::getInstance().alisim_inference_mode)) cout << "Degree of missing data: " << computeMissingData() << endl; - + #ifdef _OPENMP if (params.num_threads > partitions.size()) { cout << "Info: multi-threading strategy over alignment sites" << endl; @@ -65,8 +65,6 @@ SuperAlignment::SuperAlignment(Params ¶ms) : Alignment() cout << "Info: multi-threading strategy over partitions" << endl; } #endif - cout << endl; - } void SuperAlignment::readFromParams(Params ¶ms) { @@ -298,9 +296,9 @@ void SuperAlignment::readPartitionRaxml(Params ¶ms) { input_aln->sequence_type = params.sequence_type; ((SuperAlignment*) input_aln)->init(); } - - cout << endl << "Partition file is not in NEXUS format, assuming RAxML-style partition file..." << endl; - + + cout << "Partition file is not in NEXUS format, assuming RAxML-style partition file..." << endl; + size_t pos = params.model_name.find_first_of("+*"); string rate_type = ""; if (pos != string::npos) rate_type = params.model_name.substr(pos); @@ -499,9 +497,9 @@ void SuperAlignment::readPartitionNexus(Params ¶ms) { if (empty_partition) { cout << "NOTE: No CharPartition defined, use all CharSets" << endl; } - - cout << endl << "Loading " << sets_block->charsets.size() << " partitions..." << endl; - + + cout << "Loading " << sets_block->charsets.size() << " partitions..." << endl; + for (it = sets_block->charsets.begin(); it != sets_block->charsets.end(); it++) if (empty_partition || (*it)->char_partition != "") { if ((*it)->model_name == "") diff --git a/main/phyloanalysis.cpp b/main/phyloanalysis.cpp index ac71027f4..e01cb6477 100644 --- a/main/phyloanalysis.cpp +++ b/main/phyloanalysis.cpp @@ -331,10 +331,7 @@ void reportNexusFile(ostream &out, ModelSubst *m) { void reportLinkSubstMatrix(ostream &out, Alignment *aln, ModelSubst *m) { int i, j, k; double *rate_mat = new double[m->num_states * m->num_states]; - if (!m->isSiteSpecificModel()) - m->getRateMatrix(rate_mat); - else - ((ModelSet*)m)->front()->getRateMatrix(rate_mat); + m->getRateMatrix(rate_mat); if (m->num_states <= 4) { out << "Linked rate parameter R:" << endl << endl; @@ -402,10 +399,7 @@ void reportModel(ostream &out, Alignment *aln, ModelSubst *m) { int i, j, k; ASSERT(aln->num_states == m->num_states); double *rate_mat = new double[m->num_states * m->num_states]; - if (!m->isSiteSpecificModel()) - m->getRateMatrix(rate_mat); - else - ((ModelSet*)m)->front()->getRateMatrix(rate_mat); + m->getRateMatrix(rate_mat); if (m->num_states <= 4) { out << "Rate parameter R:" << endl << endl; @@ -483,9 +477,9 @@ void reportModel(ostream &out, Alignment *aln, ModelSubst *m) { } out << "State frequencies: "; - if (m->isSiteSpecificModel()) - out << "(site specific frequencies)" << endl << endl; - else { + if (m->isSSF()) { + out << "(site-specific frequencies)" << endl << endl; + } else { // 2016-11-03: commented out as this is not correct anymore // if (!m->isReversible()) // out << "(inferred from Q matrix)" << endl; @@ -624,7 +618,6 @@ void reportModel(ostream &out, PhyloTree &tree) { tree.getRate()->full_name = "Continuous Gamma"; } } - out << "Model of substitution: " << tree.getModelName() << endl << endl; reportModel(out, tree.aln, tree.getModel()); } @@ -1077,10 +1070,32 @@ void printOutfilesInfo(Params ¶ms, IQTree &tree) { if (raxml_format_printed) cout << " in RAxML format: " << params.out_prefix << ".best_scheme" << endl; } - if ((tree.getRate()->getGammaShape() > 0 || params.partition_file) && params.print_site_rate) + + if (params.print_ancestral_sequence) { + cout << " Ancestral state: " << params.out_prefix << ".state" << endl; +// cout << " Ancestral sequences: " << params.out_prefix << ".aseq" << endl; + } + + if (params.tree_freq_file) + cout << " Site-specific state freqs: " << params.out_prefix << ".sitefreq" + << endl; + + if (params.tree_rate_file) + cout << " Site-specific rates: " << params.out_prefix << ".siterate" + << endl; + + if (params.print_site_state_freq && !params.site_freq_file && !params.tree_freq_file) + cout << " Site-specific state freqs: " << params.out_prefix << ".freq" + << endl; + + if ((params.print_site_rate & 1) && !params.site_rate_file && !params.tree_rate_file) cout << " Site-specific rates: " << params.out_prefix << ".rate" << endl; + if ((params.print_site_rate & 2) && !params.site_rate_file && !params.tree_rate_file) + cout << " Site-specific ML rates: " << params.out_prefix << ".mlrate" + << endl; + if ((tree.getRate()->isSiteSpecificRate() || tree.getRate()->getPtnCat(0) >= 0) && params.print_site_rate) cout << " Site-rates by MH model: " << params.out_prefix << ".rate" << endl; @@ -1096,15 +1111,10 @@ void printOutfilesInfo(Params ¶ms, IQTree &tree) { if (params.print_site_prob) cout << " Site probability per rate/mix: " << params.out_prefix << ".siteprob" << endl; - + if (params.print_marginal_prob && params.optimize_params_use_hmm) cout << " Marginal probability: " << params.out_prefix << ".mprob" << endl; - if (params.print_ancestral_sequence) { - cout << " Ancestral state: " << params.out_prefix << ".state" << endl; -// cout << " Ancestral sequences: " << params.out_prefix << ".aseq" << endl; - } - if (params.write_intermediate_trees) cout << " All intermediate trees: " << params.out_prefix << ".treels" << endl; @@ -1856,25 +1866,30 @@ void exportAliSimCMD(Params ¶ms, IQTree &tree, ostream &out) || !(tree.aln->seq_type == SEQ_DNA || tree.aln->seq_type == SEQ_CODON || tree.aln->seq_type == SEQ_PROTEIN || tree.aln->seq_type == SEQ_BINARY || tree.aln->seq_type == SEQ_MORPH) || tree.isTreeMix()) return; - + out << "ALISIM COMMAND" << endl; - out << "--------------" << endl; - - string more_info = "For more information on using AliSim, please visit: www.iqtree.org/doc/AliSim"; - - // skip unsupported models - if (tree.getModel()->isMixture() || tree.getRate()->isHeterotachy() || tree.getModel()->isLieMarkov() || tree.aln->seq_type == SEQ_CODON) + out << "--------------" << endl << endl; + + string more_info = "For more information on using AliSim, please visit: http://www.iqtree.org/doc/AliSim"; + + /** unsupported model message */ + if (tree.getModel()->isSiteSpecificModel() || tree.getModel()->isMixture() || tree.getModel()->isLieMarkov() || tree.getRate()->isHeterotachy() || tree.aln->seq_type == SEQ_CODON) { - out << "Currently, we only support exporting AliSim commands automatically from the analysis for common models of DNA, Protein, Binary, and Morphological data. To simulate data from other models (mixture, lie-markov, etc), please refer to the User Manual of AliSim. Thanks!" << endl << endl; + out << "Currently, we support exporting AliSim commands automatically from the analysis only for" << endl + << "common models of DNA, Protein, Binary and Morphological data." << endl + << "To simulate data with other models (mixture, Lie-Markov, etc.), please refer to the User Manual of AliSim. Thanks!" << endl << endl; out << more_info << endl << endl; return; } - - out << "To simulate an alignment of the same length as the original alignment, using the tree and model parameters estimated from this analysis, you can use the following command:" << endl << endl; - + + /** message 1 */ + out << "To simulate an alignment of the same length as the original alignment," << endl + << "using the tree and model parameters estimated from this analysis," << endl + << "you can use the following command:" << endl << endl; + // init alisim command - string alisim_cmd = "--alisim simulated_MSA"; - + string alisim_cmd = "iqtree --alisim simulated_MSA"; + // specify tree string tree_file(params.out_prefix); if (params.partition_file && params.partition_type == BRLEN_OPTIMIZE) @@ -1882,7 +1897,7 @@ void exportAliSimCMD(Params ¶ms, IQTree &tree, ostream &out) else tree_file += ".treefile"; alisim_cmd += " -t " + tree_file; - + // specify model or a partition file // if using partitions -> specify a partition file if (params.partition_file) @@ -1907,40 +1922,47 @@ void exportAliSimCMD(Params ¶ms, IQTree &tree, ostream &out) { string model_tr = tree.getModelNameParams(true); alisim_cmd += " -m \"" + model_tr + "\""; - + // specify num_states for morph data if (tree.aln->seq_type == SEQ_MORPH) alisim_cmd += " -st \"MORPH{" + convertIntToString(tree.aln->num_states) + "}\""; } - + // specify the length of root sequence // skip specifying sequence length for partitions if (!params.partition_file) { - string root_length = ""; int num_sites = tree.aln->getNSite() * (tree.aln->seq_type == SEQ_CODON ? 3 : 1); - root_length += " --length " + convertIntToString(num_sites); + string root_length = " --length " + convertIntToString(num_sites); alisim_cmd += root_length; } - + // output alisim cmd out << alisim_cmd << endl << endl; - - out << "To mimic the alignment used to produce this analysis, i.e. simulate an alignment of the same length as the original alignment, using the tree and model parameters estimated from this analysis *and* copying the same gap positions as the original alignment, you can use the following command:" << endl << endl; - + + /** message 2 */ + out << "To mimic the alignment used to produce this analysis," << endl + << "i.e. to simulate an alignment of the same length as the original alignment," << endl + << "using the tree and model parameters estimated from this analysis *and*" << endl + << "copying the same gap positions as the original alignment," << endl + << "you can use the following command:" << endl << endl; + if (params.aln_file) out << "iqtree -s " << params.aln_file << " --alisim mimicked_MSA" << endl << endl; else out << "iqtree -s --alisim mimicked_MSA" << endl << endl; - - out << "To simulate any number of alignments in either of the two commandlines above, use the --num-alignments options, for example mimic 100 alignments you would use the command line:" << endl << endl; + /** message 3 */ + out << "To simulate any number of alignments in either of the two commandlines above," << endl + << "use the --num-alignments options." << endl + << "For example, to mimic 100 alignments you would use the command line:" << endl << endl; if (params.aln_file) out << "iqtree -s " << params.aln_file << " --alisim mimicked_MSA --num-alignments 100" << endl << endl; else out << "iqtree -s --alisim mimicked_MSA --num-alignments 100" << endl << endl; - + + /** more info message */ out << more_info << endl << endl; } @@ -2252,10 +2274,16 @@ void restoreTaxa(IQTree &iqtree, double *saved_dist_mat, NodeVector &pruned_taxa nniInfo = iqtree.optimizeNNI(); cout << "Log-likelihood after reoptimizing full tree: " << iqtree.getCurScore() << endl; //iqtree.setBestScore(iqtree.getModelFactory()->optimizeParameters(params.fixed_branch_length, true, params.model_eps)); - } } + void runApproximateBranchLengths(Params ¶ms, IQTree &iqtree) { + if ((params.leastSquareBranch || params.pars_branch_length || params.bayes_branch_length) && + iqtree.isTreeMix()) { + outWarning("-lsbran or -parsbran or -bayesbran do not work with tree mixture model"); + return; + } + // -lsbran option if (!params.fixed_branch_length && params.leastSquareBranch) { cout << endl << "Computing Least Square branch lengths..." << endl; iqtree.optimizeAllBranchesLS(); @@ -2264,11 +2292,11 @@ void runApproximateBranchLengths(Params ¶ms, IQTree &iqtree) { string filename = params.out_prefix; filename += ".lstree"; iqtree.printTree(filename.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA | WT_NEWLINE); - cout << "Logl of tree with LS branch lengths: " << iqtree.getCurScore() << endl; + cout << "LogL of tree with LS branch lengths: " << iqtree.getCurScore() << endl; cout << "Tree with LS branch lengths written to " << filename << endl; if (params.print_branch_lengths) { if (params.manuel_analytic_approx) { - cout << "Applying Manuel's analytic approximation.." << endl; + cout << "Applying Manuel's analytic approximation..." << endl; iqtree.approxAllBranches(); } ofstream out; @@ -2281,7 +2309,7 @@ void runApproximateBranchLengths(Params ¶ms, IQTree &iqtree) { } cout << "Total LS tree length: " << iqtree.treeLength() << endl; } - + // -parsbran option if (params.pars_branch_length) { cout << endl << "Computing parsimony branch lengths..." << endl; iqtree.fixNegativeBranch(true); @@ -2290,7 +2318,7 @@ void runApproximateBranchLengths(Params ¶ms, IQTree &iqtree) { string filename = params.out_prefix; filename += ".mptree"; iqtree.printTree(filename.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA | WT_NEWLINE); - cout << "Logl of tree with MP branch lengths: " << iqtree.getCurScore() << endl; + cout << "LogL of tree with MP branch lengths: " << iqtree.getCurScore() << endl; cout << "Tree with MP branch lengths written to " << filename << endl; if (params.print_branch_lengths) { ofstream out; @@ -2302,9 +2330,8 @@ void runApproximateBranchLengths(Params ¶ms, IQTree &iqtree) { cout << "MP Branch lengths written to " << filename << endl; } cout << "Total MP tree length: " << iqtree.treeLength() << endl; - } - + // -bayesbran option if (params.bayes_branch_length) { cout << endl << "Computing Bayesian branch lengths..." << endl; iqtree.computeAllBayesianBranchLengths(); @@ -2313,7 +2340,7 @@ void runApproximateBranchLengths(Params ¶ms, IQTree &iqtree) { string filename = params.out_prefix; filename += ".batree"; iqtree.printTree(filename.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA | WT_NEWLINE); - cout << "Logl of tree with Bayesian branch lengths: " << iqtree.getCurScore() << endl; + cout << "LogL of tree with Bayesian branch lengths: " << iqtree.getCurScore() << endl; cout << "Tree with Bayesian branch lengths written to " << filename << endl; if (params.print_branch_lengths) { ofstream out; @@ -2325,52 +2352,55 @@ void runApproximateBranchLengths(Params ¶ms, IQTree &iqtree) { cout << "Bayesian Branch lengths written to " << filename << endl; } cout << "Total Bayesian tree length: " << iqtree.treeLength() << endl; - } - } -void printSiteRates(IQTree &iqtree, const char *rate_file, bool bayes) { - try { +void printMiscInfo(Params ¶ms, IQTree &iqtree, double *pattern_lh) { +/** partition info */ + // -wca option + if (params.print_conaln && iqtree.isSuperTree()) { + string aln_file = (string)params.out_prefix + ".conaln"; + iqtree.aln->printAlignment(params.aln_output_format, aln_file.c_str()); + } + // -wpi option + if (params.print_partition_info && iqtree.isSuperTree()) { + ASSERT(params.print_conaln); + string aln_file = (string)params.out_prefix + ".conaln"; + string partition_info; + partition_info = (string)params.out_prefix + ".partinfo.nex"; + ((SuperAlignment*)(iqtree.aln))->printPartition(partition_info.c_str(), aln_file.c_str()); + partition_info = (string)params.out_prefix + ".partitions"; + ((SuperAlignment*)(iqtree.aln))->printPartitionRaxml(partition_info.c_str()); + } + // -wpl option + if (params.print_partition_lh && !iqtree.isSuperTree()) { + outWarning("-wpl does not work with non-partition model"); + params.print_partition_lh = false; + } + if (params.print_partition_lh && !params.pll) { + string part_lh_file = (string)params.out_prefix + ".partlh"; + printPartitionLh(part_lh_file.c_str(), &iqtree, pattern_lh); + } + // -wpbl option + if (params.write_branches && !iqtree.isSuperTree()) { + outWarning("-wpbl does not work with non-partition model"); + params.write_branches = false; + } + if (params.write_branches && params.partition_type == TOPO_UNLINKED) { + outWarning("-wpbl does not work with topology-unlinked partition model"); + params.write_branches = false; + } + if (params.write_branches) { + string filename = string(params.out_prefix) + ".branches.csv"; ofstream out; - out.exceptions(ios::failbit | ios::badbit); - out.open(rate_file); - out << "# Site-specific subtitution rates determined by "; - if (bayes) - out<< "empirical Bayesian method" << endl; - else - out<< "maximum likelihood" << endl; - out << "# This file can be read in MS Excel or in R with command:" << endl - << "# tab=read.table('" << rate_file << "',header=TRUE)" << endl - << "# Columns are tab-separated with following meaning:" << endl; - if (iqtree.isSuperTree()) { - out << "# Part: Partition ID (1=" << ((PhyloSuperTree*)&iqtree)->front()->aln->name << ", etc)" << endl - << "# Site: Site ID within partition (starting from 1 for each partition)" << endl; - } else - out << "# Site: Alignment site ID" << endl; - - if (bayes) - out << "# Rate: Posterior mean site rate weighted by posterior probability" << endl - << "# Cat: Category with highest posterior (0=invariable, 1=slow, etc)" << endl - << "# C_Rate: Corresponding rate of highest category" << endl; - else - out << "# Rate: Site rate estimated by maximum likelihood" << endl; - if (iqtree.isSuperTree()) - out << "Part\t"; - out << "Site\tRate"; - if (bayes) - out << "\tCat\tC_Rate" << endl; - else - out << endl; - iqtree.writeSiteRates(out, bayes); + out.open(filename.c_str()); + iqtree.writeBranches(out); out.close(); - } catch (ios::failure) { - outError(ERR_WRITE_OUTPUT, rate_file); + cout << "Partition branch lengths written to " << filename << endl; } - cout << "Site rates printed to " << rate_file << endl; -} -void printMiscInfo(Params ¶ms, IQTree &iqtree, double *pattern_lh) { +/** site lh info */ + // -wsl and -wslm and -wslr and -wslmr options if (params.print_site_lh && !params.pll) { string site_lh_file = params.out_prefix; site_lh_file += ".sitelh"; @@ -2379,48 +2409,27 @@ void printMiscInfo(Params ¶ms, IQTree &iqtree, double *pattern_lh) { else printSiteLhCategory(site_lh_file.c_str(), &iqtree, params.print_site_lh); } - + // -wsp and -wspm and -wspr and -wspmr options + if (params.print_site_prob && !params.pll) { + printSiteProbCategory(((string)params.out_prefix + ".siteprob").c_str(), &iqtree, params.print_site_prob); + } + // -hmmster option if (params.optimize_params_use_hmm){ string hmm_file = string(params.out_prefix) + ".hmm"; int cat_assign_method = 0; // the categories along sites is assigned according to the path with maximum probability (default) printHMMResult(hmm_file.c_str(), &iqtree, cat_assign_method); - hmm_file = string(params.out_prefix) + ".pp.hmm"; cat_assign_method = 1; // the categories along sites is assigned according to the max posterior probability printHMMResult(hmm_file.c_str(), &iqtree, cat_assign_method); - if (params.print_marginal_prob) { string mp_file = params.out_prefix; mp_file += ".mprob"; printMarginalProb(mp_file.c_str(), &iqtree); } } - - if (params.print_partition_lh && !iqtree.isSuperTree()) { - outWarning("-wpl does not work with non-partition model"); - params.print_partition_lh = false; - } - if (params.print_partition_lh && !params.pll) { - string part_lh_file = (string)params.out_prefix + ".partlh"; - printPartitionLh(part_lh_file.c_str(), &iqtree, pattern_lh); - } - - if (params.print_site_prob && !params.pll) { - printSiteProbCategory(((string)params.out_prefix + ".siteprob").c_str(), &iqtree, params.print_site_prob); - } - - if (params.print_ancestral_sequence) { - printAncestralSequences(params.out_prefix, &iqtree, params.print_ancestral_sequence); - } - - if (params.print_site_state_freq != WSF_NONE && !params.site_freq_file && !params.tree_freq_file) { - string site_freq_file = params.out_prefix; - site_freq_file += ".sitesf"; - printSiteStateFreq(site_freq_file.c_str(), &iqtree); - } - + // -wsptrees option if (params.print_trees_site_posterior) { cout << "Computing mixture posterior probabilities" << endl; IntVector pattern_cat; @@ -2436,14 +2445,12 @@ void printMiscInfo(Params ¶ms, IQTree &iqtree, double *pattern_lh) { out << ptn << "\t" << (int)iqtree.ptn_freq[ptn] << "\t" << pattern_cat[ptn] << endl; out.close(); cout << "Pattern mixtures printed to " << site_mix_file << endl; - site_mix_file = (string)params.out_prefix + ".sitemixall"; out.open(site_mix_file.c_str()); int ncat = iqtree.getRate()->getNRate(); if (iqtree.getModel()->isMixture() && !iqtree.getModelFactory()->fused_mix_rate) ncat = iqtree.getModel()->getNMixtures(); out << "Ptn\tFreq\tNumMix\tCat" << endl; - int c; for (ptn = 0; ptn < iqtree.ptn_cat_mask.size(); ptn++) { int num_cat = popcount_lauradoux((unsigned*)&iqtree.ptn_cat_mask[ptn], 2); @@ -2458,9 +2465,19 @@ void printMiscInfo(Params ¶ms, IQTree &iqtree, double *pattern_lh) { out.close(); } +/** branch length info */ + // -wbl option + if (params.print_branch_lengths && iqtree.isTreeMix()) { + outWarning("-wbl does not work with tree mixture model"); + params.print_branch_lengths = false; + } + if (params.print_branch_lengths && params.partition_type == TOPO_UNLINKED) { + outWarning("-wbl does not work with topology-unlinked partition model"); + params.print_branch_lengths = false; + } if (params.print_branch_lengths) { if (params.manuel_analytic_approx) { - cout << "Applying Manuel's analytic approximation.." << endl; + cout << "Applying Manuel's analytic approximation..." << endl; iqtree.approxAllBranches(); } string brlen_file = params.out_prefix; @@ -2471,33 +2488,42 @@ void printMiscInfo(Params ¶ms, IQTree &iqtree, double *pattern_lh) { out.close(); cout << "Branch lengths written to " << brlen_file << endl; } + // -blscale option + if (params.fixed_branch_length == BRLEN_SCALE) { + string filename = (string)params.out_prefix + ".blscale"; + iqtree.printTreeLengthScaling(filename.c_str()); + cout << "Scaled tree length and model parameters printed to " << filename << endl; + } - if (params.write_branches) { - string filename = string(params.out_prefix) + ".branches.csv"; - ofstream out; - out.open(filename.c_str()); - iqtree.writeBranches(out); - out.close(); - cout << "Branch lengths written to " << filename << endl; +/** ancestral sequence info */ + // -asr and -asr-joint options + if (params.print_ancestral_sequence && iqtree.isTreeMix()) { + outWarning("-asr does not work with tree mixture model"); + params.print_ancestral_sequence = AST_NONE; } - - if (params.print_conaln && iqtree.isSuperTree()) { - string str = params.out_prefix; - str = params.out_prefix; - str += ".conaln"; - iqtree.aln->printAlignment(params.aln_output_format, str.c_str()); + if (params.print_ancestral_sequence && params.partition_type == TOPO_UNLINKED) { + outWarning("-asr does not work with topology-unlinked partition model"); + params.print_ancestral_sequence = AST_NONE; } - - if (params.print_partition_info && iqtree.isSuperTree()) { - ASSERT(params.print_conaln); - string aln_file = (string)params.out_prefix + ".conaln"; - string partition_info = params.out_prefix; - partition_info += ".partinfo.nex"; - ((SuperAlignment*)(iqtree.aln))->printPartition(partition_info.c_str(), aln_file.c_str()); - partition_info = (string)params.out_prefix + ".partitions"; - ((SuperAlignment*)(iqtree.aln))->printPartitionRaxml(partition_info.c_str()); + if (params.print_ancestral_sequence) { + printAncestralSequences(params.out_prefix, &iqtree, params.print_ancestral_sequence); } +/** site state frequency and rate info */ + // -wsf option + if (params.print_site_state_freq && !params.site_freq_file && !params.tree_freq_file) { + printSiteStateFreq(((string)params.out_prefix + ".freq").c_str(), &iqtree); + } + // -wsr and --mlrate options + if (params.print_site_rate && !params.site_rate_file && !params.tree_rate_file) { + if (iqtree.isTreeMix()) + outError("Inferring site rates is not supported for tree mixture model, pls contact the developers"); + if (params.print_site_rate & 1) + printSiteRate(((string)params.out_prefix + ".rate").c_str(), &iqtree, true); + if (params.print_site_rate & 2) + printSiteRate(((string)params.out_prefix + ".mlrate").c_str(), &iqtree, false); + } + // -mh and -mhs options if (params.mvh_site_rate) { RateMeyerHaeseler *rate_mvh = new RateMeyerHaeseler(params.rate_file, &iqtree, params.rate_mh_type); @@ -2511,41 +2537,21 @@ void printMiscInfo(Params ¶ms, IQTree &iqtree, double *pattern_lh) { ofstream out; out.exceptions(ios::failbit | ios::badbit); out.open(mhrate_file.c_str()); - iqtree.writeSiteRates(out, true); + iqtree.writeSiteRates(out, false); out.close(); } catch (ios::failure) { outError(ERR_WRITE_OUTPUT, mhrate_file); } - if (params.print_site_lh) { string site_lh_file = params.out_prefix; site_lh_file += ".mhsitelh"; printSiteLh(site_lh_file.c_str(), &iqtree); } } - - if (params.print_site_rate & 1) { - string rate_file = params.out_prefix; - rate_file += ".rate"; - printSiteRates(iqtree, rate_file.c_str(), true); - } - - if (params.print_site_rate & 2) { - string rate_file = params.out_prefix; - rate_file += ".mlrate"; - printSiteRates(iqtree, rate_file.c_str(), false); - } - - if (params.fixed_branch_length == BRLEN_SCALE) { - string filename = (string)params.out_prefix + ".blscale"; - iqtree.printTreeLengthScaling(filename.c_str()); - cout << "Scaled tree length and model parameters printed to " << filename << endl; - } - } void printFinalSearchInfo(Params ¶ms, IQTree &iqtree, double search_cpu_time, double search_real_time) { - + cout << endl; if (iqtree.isTreeMix()) { cout << "Total tree lengths:"; IQTreeMix* treemix = (IQTreeMix*) &iqtree; @@ -2556,7 +2562,6 @@ void printFinalSearchInfo(Params ¶ms, IQTree &iqtree, double search_cpu_time } else { cout << "Total tree length: " << iqtree.treeLength() << endl; } - if (iqtree.isSuperTree() && verbose_mode >= VB_MAX) { PhyloSuperTree *stree = (PhyloSuperTree*) &iqtree; cout << stree->evalNNIs << " NNIs evaluated from " << stree->totalNNIs << " all possible NNIs ( " << @@ -2569,9 +2574,7 @@ void printFinalSearchInfo(Params ¶ms, IQTree &iqtree, double search_cpu_time << " %)" << endl; } } - params.run_time = (getCPUTime() - params.startCPUTime); - cout << endl; cout << "Total number of iterations: " << iqtree.stop_rule.getCurIt() << endl; // cout << "Total number of partial likelihood vector computations: " << iqtree.num_partial_lh_computations << endl; cout << "CPU time used for tree search: " << search_cpu_time @@ -2583,7 +2586,6 @@ void printFinalSearchInfo(Params ¶ms, IQTree &iqtree, double search_cpu_time cout << "Total wall-clock time used: " << getRealTime() - params.start_real_time << " sec (" << convert_time(getRealTime() - params.start_real_time) << ")" << endl; - } void printTrees(vector trees, Params ¶ms, string suffix) { @@ -2647,12 +2649,14 @@ void startTreeReconstruction(Params ¶ms, IQTree* &iqtree, ModelCheckpoint &m params.startCPUTime = getCPUTime(); params.start_real_time = getRealTime(); - string best_subst_name, best_rate_name; - runModelFinder(params, *iqtree, model_info, best_subst_name, best_rate_name); - - optimiseQMixModel(params, iqtree, model_info); + if (!iqtree->isTreeMix()) { + string best_subst_name, best_rate_name; + runModelFinder(params, *iqtree, model_info, best_subst_name, best_rate_name); + + optimiseQMixModel(params, iqtree, model_info); + } } - + /** optimize branch lengths of consensus tree */ @@ -3750,9 +3754,12 @@ void runStandardBootstrap(Params ¶ms, Alignment *alignment, IQTree *tree) { bootstrap_alignment->printAlignment(params.aln_output_format, bootaln_name.c_str(), true); } - if (params.print_boot_site_freq && MPIHelper::getInstance().isMaster()) { - printSiteStateFreq((((string)params.out_prefix)+"."+convertIntToString(sample)+".bootsitefreq").c_str(), bootstrap_alignment); - bootstrap_alignment->printAlignment(params.aln_output_format, (((string)params.out_prefix)+"."+convertIntToString(sample)+".bootaln").c_str()); + if ((params.print_boot_site_freq || params.print_boot_site_rate) && MPIHelper::getInstance().isMaster()) { + if (params.print_boot_site_freq) + printSiteParam((((string)params.out_prefix)+"."+convertIntToString(sample)+".bootsitefreq").c_str(), bootstrap_alignment, "freq"); + if (params.print_boot_site_rate) + printSiteParam((((string)params.out_prefix)+"."+convertIntToString(sample)+".bootsiterate").c_str(), bootstrap_alignment, "rate"); + bootstrap_alignment->printAlignment(params.aln_output_format, (((string)params.out_prefix)+"."+convertIntToString(sample)+".bootaln").c_str()); } if (!tree->constraintTree.empty()) { @@ -3955,62 +3962,88 @@ void convertAlignment(Params ¶ms, IQTree *iqtree) { /** 2016-08-04: compute a site frequency model for profile mixture model */ -void computeSiteFrequencyModel(Params ¶ms, Alignment *alignment) { - - cout << endl << "===> COMPUTING SITE FREQUENCY MODEL BASED ON TREE FILE " << params.tree_freq_file << endl; - ASSERT(params.tree_freq_file); - PhyloTree *tree = new PhyloTree(alignment); - tree->setParams(¶ms); - bool myrooted = params.is_rooted; - tree->readTree(params.tree_freq_file, myrooted); - tree->setAlignment(alignment); - tree->setRootNode(params.root); - - ModelsBlock *models_block = readModelsDefinition(params); - tree->setModelFactory(new ModelFactory(params, alignment->model_name, tree, models_block)); - delete models_block; - tree->setModel(tree->getModelFactory()->model); - tree->setRate(tree->getModelFactory()->site_rate); - tree->setLikelihoodKernel(params.SSE); - tree->setNumThreads(params.num_threads); - - if (!tree->getModel()->isMixture()) - outError("No mixture model was specified!"); - uint64_t mem_size = tree->getMemoryRequired(); - uint64_t total_mem = getMemorySize(); - cout << "NOTE: " << (mem_size / 1024) / 1024 << " MB RAM is required!" << endl; - if (mem_size >= total_mem) { - outError("Memory required exceeds your computer RAM size!"); - } +void computeSiteSpecificModel(Params ¶ms, Alignment *alignment, const string ¶m_type) +{ + ASSERT((param_type == "freq" && params.tree_freq_file) || + (param_type == "rate" && params.tree_rate_file)); + string msg = (param_type == "freq") ? "FREQUENCY" : "RATE"; + char *filename = (param_type == "freq") ? params.tree_freq_file : params.tree_rate_file; + cout << endl << "===> COMPUTING SITE " << msg << " MODEL BASED ON TREE FILE " << filename << endl; + // init auxiliary tree, model, etc. + PhyloTree *tree = new PhyloTree(alignment); + tree->setParams(¶ms); + tree->setLikelihoodKernel(params.SSE); + tree->setNumThreads(params.num_threads); + bool myrooted = params.is_rooted; + tree->readTree(filename, myrooted); + tree->setRootNode(params.root); + tree->setAlignment(alignment); + ModelsBlock *models_block = readModelsDefinition(params); + tree->setModelFactory(new ModelFactory(params, alignment->model_name, tree, models_block)); + delete models_block; + // check model compatibility + if (!tree->getModel()->isReversible()) + outError("Non-reversible models are incompatible with site-specific models"); + if (tree->getModel()->isMixture() && !tree->getModel()->isMixtureSameQ()) + outError("Matrix mixture models are incompatible with site-specific models. Use -wsf or -wsr options to estimate site-specific parameters"); + if (tree->getModel()->isFused()) + outError("Unlinked rate mixture models are incompatible with site-specific models. Use -wsf or -wsr options to estimate site-specific parameters"); + if (param_type == "rate" && tree->getModel()->isMixture()) + outError("Frequency mixture models are incompatible with site-specific models. Use -wsr option to estimate site-specific rates"); + // check for the starting mixture model + if (param_type == "freq" && !tree->getModel()->isMixture()) + outError("No frequency mixture model was specified!"); + if (param_type == "rate" && !tree->getRate()->isMixture()) + outError("No rate mixture model was specified!"); + uint64_t mem_size = tree->getMemoryRequired(); + uint64_t total_mem = getMemorySize(); + cout << "NOTE: " << (mem_size / 1024) / 1024 << " MB RAM is required!" << endl; + if (mem_size >= total_mem) + outError("Memory required exceeds your computer RAM size!"); #ifdef BINARY32 - if (mem_size >= 2000000000) { - outError("Memory required exceeds 2GB limit of 32-bit executable"); - } + if (mem_size >= 2000000000) + outError("Memory required exceeds 2GB limit of 32-bit executable"); #endif - - tree->ensureNumberOfThreadsIsSet(nullptr); - - tree->initializeAllPartialLh(); - // 2017-12-07: Increase espilon ten times (0.01 -> 0.1) to speedup PMSF computation - tree->getModelFactory()->optimizeParameters(params.fixed_branch_length, true, params.modelEps*10); - - size_t nptn = alignment->getNPattern(), nstates = alignment->num_states; - double *ptn_state_freq = new double[nptn*nstates]; - tree->computePatternStateFreq(ptn_state_freq); - alignment->site_state_freq.resize(nptn); - for (size_t ptn = 0; ptn < nptn; ptn++) { - double *f = new double[nstates]; - memcpy(f, ptn_state_freq+ptn*nstates, sizeof(double)*nstates); - alignment->site_state_freq[ptn] = f; - } - alignment->getSitePatternIndex(alignment->site_model); - printSiteStateFreq(((string)params.out_prefix+".sitefreq").c_str(), tree, ptn_state_freq); - params.print_site_state_freq = WSF_NONE; - - delete [] ptn_state_freq; - delete tree; - - cout << endl << "===> CONTINUE ANALYSIS USING THE INFERRED SITE FREQUENCY MODEL" << endl; + tree->ensureNumberOfThreadsIsSet(nullptr); + tree->initializeAllPartialLh(); + // 2017-12-07: Increase espilon ten times (0.01 -> 0.1) to speedup PMSF computation + double modelEpsilon = (param_type == "freq") ? params.modelEps * 10.0 : params.modelEps; + cout << "Estimate initial model parameters (epsilon = " << modelEpsilon << ")" << endl; + tree->getModelFactory()->optimizeParameters(params.fixed_branch_length, true, modelEpsilon); + // compute state freqs or rate scalers for all the alignment patterns + size_t nptn = alignment->getNPattern(); + if (param_type == "freq") { + double *all_ptn_state_freq = nullptr; + tree->computePatternStateFreq(all_ptn_state_freq); + ASSERT(all_ptn_state_freq); + size_t nstates = alignment->num_states; + for (size_t ptn = 0; ptn < nptn; ptn++) { + double *state_freqs = new double[nstates]; + memcpy(state_freqs, all_ptn_state_freq + ptn*nstates, sizeof(double)*nstates); + alignment->convfreq(state_freqs); // regularize freqs + alignment->ptn_state_freq.push_back(state_freqs); + } + delete [] all_ptn_state_freq; + ASSERT(alignment->ptn_state_freq.size() == nptn); + params.site_state_freq_type = WSF_NONE; + } else { + DoubleVector ptn_rate; + tree->computePatternRate(ptn_rate); + ASSERT(ptn_rate.size()); + for (size_t ptn = 0; ptn < nptn; ptn++) { + double rate = ptn_rate[ptn]; + rate = min(max(rate, MIN_SITE_RATE), MAX_SITE_RATE); // regularize rate + alignment->ptn_rate_scaler.push_back(rate); + } + ASSERT(alignment->ptn_rate_scaler.size() == nptn); + params.site_rate_type = WSR_NONE; + } + delete tree; + // print the computed site-specific params into a file + string out_suffix = (param_type == "freq") ? ".sitefreq" : ".siterate"; + printSiteParam(((string)params.out_prefix + out_suffix).c_str(), alignment, param_type); + // continue analysis using the alignment with the computed site-specific params + cout << endl << "===> CONTINUE ANALYSIS USING THE INFERRED SITE " << msg << " MODEL" << endl; } @@ -4071,7 +4104,12 @@ int checkCharInFile(char* infile, char c) { IQTree *newIQTreeMix(Params ¶ms, Alignment *alignment, int numTree = 0) { int i, k; vector trees; - + + if (alignment->isSuperAlignment()) { + outError("Tree mixture model does not work with partition models"); + } else if (posRateHeterotachy(alignment->model_name) != string::npos || params.num_mixlen > 1) { + outError("Tree mixture model does not work with heterotachy model"); + } if (numTree == 1) { outError("The number after +T has to be greater than 1"); } @@ -4342,36 +4380,47 @@ void runPhyloAnalysis(Params ¶ms, Checkpoint *checkpoint, IQTree *&tree, Ali alignment = new SuperAlignmentUnlinked(params); else alignment = new SuperAlignment(params); + if (params.tree_freq_file || params.site_freq_file || params.tree_rate_file || params.site_rate_file) + outError("Partition models do not work with site-specific models"); } else { alignment = createAlignment(params.aln_file, params.sequence_type, params.intype, params.model_name); - if (params.freq_const_patterns) { int orig_nsite = alignment->getNSite(); alignment->addConstPatterns(params.freq_const_patterns); cout << "INFO: " << alignment->getNSite() - orig_nsite << " const sites added into alignment" << endl; } - // Initialize site-frequency model if (params.tree_freq_file) { if (checkpoint->getBool("finishedSiteFreqFile")) { - alignment->readSiteStateFreq(((string)params.out_prefix + ".sitefreq").c_str()); - params.print_site_state_freq = WSF_NONE; + alignment->readSiteParamFile(((string)params.out_prefix + ".sitefreq").c_str(), "freq"); + params.site_state_freq_type = WSF_NONE; cout << "CHECKPOINT: Site frequency model restored" << endl; } else { - computeSiteFrequencyModel(params, alignment); + computeSiteSpecificModel(params, alignment, "freq"); checkpoint->putBool("finishedSiteFreqFile", true); checkpoint->dump(); } - } - if (params.site_freq_file) { - alignment->readSiteStateFreq(params.site_freq_file); + } else if (params.site_freq_file) { + alignment->readSiteParamFile(params.site_freq_file, "freq"); + } + // Initialize site-rate model + if (params.tree_rate_file) { + if (checkpoint->getBool("finishedSiteRateFile")) { + alignment->readSiteParamFile(((string)params.out_prefix + ".siterate").c_str(), "rate"); + params.site_rate_type = WSR_NONE; + cout << "CHECKPOINT: Site rate model restored" << endl; + } else { + computeSiteSpecificModel(params, alignment, "rate"); + checkpoint->putBool("finishedSiteRateFile", true); + checkpoint->dump(); + } + } else if (params.site_rate_file) { + alignment->readSiteParamFile(params.site_rate_file, "rate"); } } - if (params.symtest) { doSymTest(alignment, params); } - if (params.print_aln_info) { string site_info_file = string(params.out_prefix) + ".alninfo"; alignment->printSiteInfo(site_info_file.c_str()); @@ -4380,20 +4429,18 @@ void runPhyloAnalysis(Params ¶ms, Checkpoint *checkpoint, IQTree *&tree, Ali /*************** initialize tree ********************/ bool isTreeMix = isTreeMixture(params); - - if (params.optimize_params_use_hmm && !isTreeMix) { - outError("option '-hmmster' is only available for tree mixture model"); - } - + if (isTreeMix) { + cout << endl; if (params.optimize_params_use_hmm) cout << "HMMSTER "; // tree-mixture model cout << "Tree-mixture model" << endl; - if (params.compute_ml_tree_only) { + if (params.compute_ml_tree_only) outError("option compute_ml_tree_only cannot be set for tree-mixture model"); - } + if (params.user_file == NULL) + outError("To use tree-mixture model, use an option: -te "); // the minimum gamma shape should be greater than MIN_GAMMA_SHAPE_TREEMIX for tree mixture model if (params.min_gamma_shape < MIN_GAMMA_SHAPE_TREEMIX) { @@ -4402,10 +4449,6 @@ void runPhyloAnalysis(Params ¶ms, Checkpoint *checkpoint, IQTree *&tree, Ali params.min_gamma_shape = MIN_GAMMA_SHAPE_TREEMIX; } - if (params.user_file == NULL) { - outError("To use tree-mixture model, use an option: -te "); - } - // get the number after "+T" for tree-mixture model int treeNum = getTreeMixNum(params); if (treeNum == 0) { @@ -4415,6 +4458,9 @@ void runPhyloAnalysis(Params ¶ms, Checkpoint *checkpoint, IQTree *&tree, Ali } } else { + if (params.optimize_params_use_hmm) + outError("option '-hmmster' is only available for tree mixture model"); + tree = newIQTree(params, alignment); } diff --git a/main/phylotesting.cpp b/main/phylotesting.cpp index 154ce378c..7000b0831 100644 --- a/main/phylotesting.cpp +++ b/main/phylotesting.cpp @@ -781,6 +781,7 @@ void transferModelFinderParameters(IQTree *iqtree, Checkpoint *target) { void runModelFinder(Params ¶ms, IQTree &iqtree, ModelCheckpoint &model_info, string &best_subst_name, string &best_rate_name) { +/* if (params.model_name.find("+T") != string::npos) { // tree mixture return; @@ -808,6 +809,42 @@ void runModelFinder(Params ¶ms, IQTree &iqtree, ModelCheckpoint &model_info, // Model already specifed, nothing to do here if (!empty_model_found && params.model_name.substr(0, 4) != "TEST" && params.model_name.substr(0, 2) != "MF") return; +*/ + if (params.model_name == "MIX+MFP" || params.model_name == "MIX+MF") return; + bool empty_model_found = params.model_name.empty() && !iqtree.isSuperTree(); + if (params.model_name.empty() && iqtree.isSuperTree()) { + // check whether any partition has empty model_name + PhyloSuperTree *stree = (PhyloSuperTree*)&iqtree; + for (auto it = stree->begin(); it != stree->end(); it++) { + if ((*it)->aln->model_name.empty()) { + empty_model_found = true; + break; + } + } + } + if (params.model_joint) { + empty_model_found = false; + } + bool test = empty_model_found || + (params.model_name == "TEST") || (params.model_name == "TESTONLY") || + (params.model_name == "MFP") || (params.model_name == "MF"); + bool testmerge = (params.model_name == "TESTMERGE") || (params.model_name == "TESTMERGEONLY") || + (params.model_name == "MFP+MERGE") || (params.model_name == "MF+MERGE"); + if (testmerge && !iqtree.isSuperTree()) + outError("No partition file specified along with the option -m " + params.model_name); + bool test_only = (params.model_name == "TESTONLY") || (params.model_name == "MF") || + (params.model_name == "TESTMERGEONLY") || (params.model_name == "MF+MERGE"); + cout << endl; + if (!test && !testmerge) { + string model_name = (params.partition_file) ? string(params.partition_file) : params.model_name; + if (params.partition_file && params.model_name.size()) + model_name += " and " + params.model_name + " (for missing models)"; + cout << "Skip ModelFinder, model specified: " << model_name << endl; + return; + } + cout << "Running ModelFinder:" << endl; + if (params.site_freq_file || params.site_rate_file) + outError("ModelFinder does not work with site-specific models"); if (MPIHelper::getInstance().getNumProcesses() > 1) outError("Please use only 1 MPI process! We are currently working on the MPI parallelization of model selection."); // TODO: check if necessary @@ -823,7 +860,7 @@ void runModelFinder(Params ¶ms, IQTree &iqtree, ModelCheckpoint &model_info, ok_model_file = model_info.load(); } - cout << endl; + //cout << endl; ok_model_file &= model_info.size() > 0; if (ok_model_file) @@ -925,14 +962,14 @@ void runModelFinder(Params ¶ms, IQTree &iqtree, ModelCheckpoint &model_info, } delete models_block; - + // force to dump all checkpointing information model_info.dump(true); - + // transfer models parameters transferModelFinderParameters(&iqtree, orig_checkpoint); iqtree.setCheckpoint(orig_checkpoint); - + params.startCPUTime = cpu_time; params.start_real_time = real_time; cpu_time = getCPUTime() - cpu_time; @@ -941,7 +978,9 @@ void runModelFinder(Params ¶ms, IQTree &iqtree, ModelCheckpoint &model_info, cout << "All model information printed to " << model_info.getFileName() << endl; cout << "CPU time for ModelFinder: " << cpu_time << " seconds (" << convert_time(cpu_time) << ")" << endl; cout << "Wall-clock time for ModelFinder: " << real_time << " seconds (" << convert_time(real_time) << ")" << endl; - + + cout << endl; + cout << "ModelFinder finished, model selected: " << iqtree.aln->model_name << endl; // alignment = iqtree.aln; if (test_only) { params.min_iterations = 0; @@ -3699,28 +3738,27 @@ void optimiseQMixModel_method_update(Params ¶ms, IQTree* &iqtree, ModelCheck // Optimisation of Q-Mixture model, including estimation of best number of classes in the mixture void optimiseQMixModel(Params ¶ms, IQTree* &iqtree, ModelCheckpoint &model_info) { - IQTree* new_iqtree; - string model_str; - - if (params.model_name.substr(0,6) != "MIX+MF") - return; - + bool test = (params.model_name == "MIX+MFP") || (params.model_name == "MIX+MF"); bool test_only = (params.model_name == "MIX+MF"); - params.model_name = ""; - + if (!test) { + return; + } if (MPIHelper::getInstance().getNumProcesses() > 1) outError("Error! The option -m '" + params.model_name + "' does not support MPI parallelization"); - if (iqtree->isSuperTree()) outError("Error! The option -m '" + params.model_name + "' cannot work on data set with partitions"); - if (iqtree->aln->seq_type != SEQ_DNA) outError("Error! The option -m '" + params.model_name + "' can only work on DNA data set"); + cout << endl; cout << "--------------------------------------------------------------------" << endl; cout << "| Optimizing Q-mixture model |" << endl; cout << "--------------------------------------------------------------------" << endl; + IQTree* new_iqtree; + string model_str; + + params.model_name = ""; // disable the bootstrapping int orig_gbo_replicates = params.gbo_replicates; ConsensusType orig_consensus_type = params.consensus_type; diff --git a/main/treetesting.cpp b/main/treetesting.cpp index 5158d9cb4..db0c7583b 100644 --- a/main/treetesting.cpp +++ b/main/treetesting.cpp @@ -137,21 +137,26 @@ void printPartitionLh(const char*filename, PhyloTree *tree, double *ptn_lh, delete[] pattern_lh; } -void printSiteLhCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl) { - +void printSiteLhCategory(const char *filename, PhyloTree *tree, SiteLoglType wsl) { + // error check if (wsl == WSL_NONE || wsl == WSL_SITE) return; + // tree mixture model if (tree->isTreeMix()) { - wsl = WSL_TMIXTURE; // tree mixture model - } else if (!tree->getModel()->isMixture()) { - if (wsl != WSL_RATECAT) { - wsl = WSL_RATECAT; - } - } else { - // mixture model + outWarning("Switching to tree categories, as it is the only option for tree mixture model"); + wsl = WSL_TMIXTURE; + // mixture model + } else if (tree->getModel()->isMixture()) { if (wsl == WSL_MIXTURE_RATECAT && tree->getModelFactory()->fused_mix_rate) { + outWarning("Switching to -wslm, as -wslmr is not suitable for fused mixture model"); wsl = WSL_MIXTURE; } + // non-mixture model + } else { + if (wsl != WSL_RATECAT) { + outWarning("Switching to -wslr, as it is the only option for non-mixture model"); + wsl = WSL_RATECAT; + } } int ncat = tree->getNumLhCat(wsl); if (tree->isSuperTree()) { @@ -163,8 +168,6 @@ void printSiteLhCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl) } } int i; - - try { ofstream out; out.exceptions(ios::failbit | ios::badbit); @@ -178,12 +181,10 @@ void printSiteLhCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl) << "# Site: Site ID within partition (starting from 1 for each partition)" << endl; } else out << "# Site: Alignment site ID" << endl; - out << "# LnL: Logarithm of site likelihood" << endl << "# Thus, sum of LnL is equal to tree log-likelihood" << endl << "# LnLW_k: Logarithm of (category-k site likelihood times category-k weight)" << endl << "# Thus, sum of exp(LnLW_k) is equal to exp(LnL)" << endl; - if (tree->isSuperTree()) { out << "Part\tSite\tLnL"; } else @@ -193,9 +194,7 @@ void printSiteLhCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl) out << endl; out.precision(4); out.setf(ios::fixed); - tree->writeSiteLh(out, wsl); - out.close(); cout << "Site log-likelihoods per category printed to " << filename << endl; /* @@ -216,11 +215,80 @@ void printSiteLhCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl) } catch (ios::failure) { outError(ERR_WRITE_OUTPUT, filename); } - +} + +void printSiteProbCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl) { + // error check + if (wsl == WSL_NONE || wsl == WSL_SITE) + return; + // tree mixture model + if (tree->isTreeMix()) { + outWarning("Switching to tree categories, as it is the only option for tree mixture model"); + wsl = WSL_TMIXTURE; + // mixture model + } else if (tree->getModel()->isMixture()) { + if (wsl == WSL_MIXTURE_RATECAT && tree->getModelFactory()->fused_mix_rate) { + outWarning("Switching to -wspm, as -wspmr is not suitable for fused mixture model"); + wsl = WSL_MIXTURE; + } + // non-mixture model + } else { + if (wsl != WSL_RATECAT) { + outWarning("Switching to -wspr, as it is the only option for non-mixture model"); + wsl = WSL_RATECAT; + } + } + size_t cat, ncat = tree->getNumLhCat(wsl); + double *ptn_prob_cat = new double[((size_t)tree->getAlnNPattern())*ncat]; + tree->computePatternProbabilityCategory(ptn_prob_cat, wsl); + try { + ofstream out; + out.exceptions(ios::failbit | ios::badbit); + out.open(filename); + if (tree->isSuperTree()) + out << "Set\t"; + out << "Site"; + for (cat = 0; cat < ncat; cat++) + out << "\tp" << cat+1; + out << endl; + IntVector pattern_index; + if (tree->isSuperTree()) { + PhyloSuperTree *super_tree = (PhyloSuperTree*)tree; + size_t offset = 0; + for (PhyloSuperTree::iterator it = super_tree->begin(); it != super_tree->end(); it++) { + size_t part_ncat = (*it)->getNumLhCat(wsl); + (*it)->aln->getSitePatternIndex(pattern_index); + size_t nsite = (*it)->aln->getNSite(); + for (size_t site = 0; site < nsite; ++site) { + out << (it-super_tree->begin())+1 << "\t" << site+1; + double *prob_cat = ptn_prob_cat + (offset+pattern_index[site]*part_ncat); + for (cat = 0; cat < part_ncat; cat++) + out << "\t" << prob_cat[cat]; + out << endl; + } + offset += (*it)->aln->getNPattern()*(*it)->getNumLhCat(wsl); + } + } else { + tree->aln->getSitePatternIndex(pattern_index); + size_t nsite = tree->getAlnNSite(); + for (size_t site = 0; site < nsite; ++site) { + out << site+1; + double *prob_cat = ptn_prob_cat + pattern_index[site]*ncat; + for (cat = 0; cat < ncat; cat++) { + out << "\t" << prob_cat[cat]; + } + out << endl; + } + } + out.close(); + cout << "Site probabilities per category printed to " << filename << endl; + } catch (ios::failure) { + outError(ERR_WRITE_OUTPUT, filename); + } } void printAncestralSequences(const char *out_prefix, PhyloTree *tree, AncestralSeqType ast) { - + cout << endl << "Infer ancestral site states" << endl; // int *joint_ancestral = NULL; // // if (tree->params->print_ancestral_sequence == AST_JOINT) { @@ -335,139 +403,126 @@ void printAncestralSequences(const char *out_prefix, PhyloTree *tree, AncestralS } -void printSiteProbCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl) { - - if (wsl == WSL_NONE || wsl == WSL_SITE) - return; - // error checking - if (tree->isTreeMix()) - wsl = WSL_TMIXTURE; // tree mixture model - else if (!tree->getModel()->isMixture()) { - if (wsl != WSL_RATECAT) { - outWarning("Switch now to '-wspr' as it is the only option for non-mixture model"); - wsl = WSL_RATECAT; - } - } else { - // mixture model - if (wsl == WSL_MIXTURE_RATECAT && tree->getModelFactory()->fused_mix_rate) { - outWarning("-wspmr is not suitable for fused mixture model, switch now to -wspm"); - wsl = WSL_MIXTURE; - } - } - size_t cat, ncat = tree->getNumLhCat(wsl); - double *ptn_prob_cat = new double[((size_t)tree->getAlnNPattern())*ncat]; - tree->computePatternProbabilityCategory(ptn_prob_cat, wsl); - - try { - ofstream out; - out.exceptions(ios::failbit | ios::badbit); - out.open(filename); - if (tree->isSuperTree()) - out << "Set\t"; - out << "Site"; - for (cat = 0; cat < ncat; cat++) - out << "\tp" << cat+1; - out << endl; - IntVector pattern_index; - if (tree->isSuperTree()) { - PhyloSuperTree *super_tree = (PhyloSuperTree*)tree; - size_t offset = 0; - for (PhyloSuperTree::iterator it = super_tree->begin(); it != super_tree->end(); it++) { - size_t part_ncat = (*it)->getNumLhCat(wsl); - (*it)->aln->getSitePatternIndex(pattern_index); - size_t nsite = (*it)->aln->getNSite(); - for (size_t site = 0; site < nsite; ++site) { - out << (it-super_tree->begin())+1 << "\t" << site+1; - double *prob_cat = ptn_prob_cat + (offset+pattern_index[site]*part_ncat); - for (cat = 0; cat < part_ncat; cat++) - out << "\t" << prob_cat[cat]; - out << endl; - } - offset += (*it)->aln->getNPattern()*(*it)->getNumLhCat(wsl); - } - } else { - tree->aln->getSitePatternIndex(pattern_index); - size_t nsite = tree->getAlnNSite(); - for (size_t site = 0; site < nsite; ++site) { - out << site+1; - double *prob_cat = ptn_prob_cat + pattern_index[site]*ncat; - for (cat = 0; cat < ncat; cat++) { - out << "\t" << prob_cat[cat]; - } - out << endl; - } - } - out.close(); - cout << "Site probabilities per category printed to " << filename << endl; - } catch (ios::failure) { - outError(ERR_WRITE_OUTPUT, filename); - } - +void printSiteStateFreq(const char *filename, PhyloTree *tree) { + cout << endl << "Infer site state frequencies" << endl; + try { + ofstream out; + out.exceptions(ios::failbit | ios::badbit); + out.open(filename); + string msg; + // write the comment header + out << "# Site-specific state frequencies determined by empirical Bayesian method" << endl; + out << "# This file can be read in MS Excel or in R with command:" << endl + << "# tab=read.table('" << filename << "',header=TRUE,fill=TRUE)" << endl + << "# Columns are tab-separated with following meaning:" << endl; + if (tree->isSuperTree()) + out << "# Part: Partition ID (1=" << ((PhyloSuperTree*)tree)->front()->aln->name << ", etc)" << endl + << "# Site: Site ID within partition (starting from 1 for each partition)" << endl; + else + out << "# Site: Alignment site ID" << endl; + msg = (tree->params->site_state_freq_type == WSF_POSTERIOR_MEAN) ? "mean" : "max"; + out << "# pi_X: Posterior " << msg << " site frequency of state X" << endl + << "# Cat: Category with the highest posterior weight" << endl + << "# CatPP: Posterior probability of the highest weight category" << endl; + // write the main header + msg = ""; + size_t nstates; + if (tree->isSuperTree()) { + msg += "Part\tSite"; + nstates = ((SuperAlignment*)tree->aln)->max_num_states; + for (size_t x = 0; x < nstates; ++x) + msg += "\tpi_" + convertIntToString(x); + } else { + msg += "Site"; + nstates = tree->aln->num_states; + for (size_t x = 0; x < nstates; ++x) + msg += "\tpi_" + tree->aln->convertStateBackStr(x); + } + msg += "\tCat\tCatPP"; + out << msg << endl; + // infer and write the site freqs + tree->writeSiteFreqs(out); + out.close(); + } catch (ios::failure) { + outError(ERR_WRITE_OUTPUT, filename); + } + cout << "Site state frequencies printed to " << filename << endl; } - -void printSiteStateFreq(const char*filename, PhyloTree *tree, double *state_freqs) { - size_t nsites = tree->getAlnNSite(); - size_t nstates = tree->aln->num_states; - double *ptn_state_freq; - if (state_freqs) { - ptn_state_freq = state_freqs; - } else { - ptn_state_freq = new double[((size_t)tree->getAlnNPattern()) * nstates]; - tree->computePatternStateFreq(ptn_state_freq); - } - - try { - ofstream out; - out.exceptions(ios::failbit | ios::badbit); - out.open(filename); - IntVector pattern_index; - tree->aln->getSitePatternIndex(pattern_index); - for (size_t i = 0; i < nsites; ++i) { - out.width(6); - out << left << i+1 << " "; - double *state_freq = &ptn_state_freq[pattern_index[i]*nstates]; - for (size_t j = 0; j < nstates; ++j) { - out.width(15); - out << state_freq[j] << " "; - } - out << endl; - } - out.close(); - cout << "Site state frequency vectors printed to " << filename << endl; - } catch (ios::failure) { - outError(ERR_WRITE_OUTPUT, filename); - } - if (!state_freqs) - delete [] ptn_state_freq; +void printSiteRate(const char *filename, PhyloTree *tree, bool bayes) { + cout << endl << "Infer site rates" << endl; + try { + ofstream out; + out.exceptions(ios::failbit | ios::badbit); + out.open(filename); + string msg; + // write the comment header + msg = (bayes) ? "empirical Bayesian method" : "maximum likelihood"; + out << "# Site-specific substitution rates determined by " << msg << endl; + out << "# This file can be read in MS Excel or in R with command:" << endl + << "# tab=read.table('" << filename << "',header=TRUE,fill=TRUE)" << endl + << "# Columns are tab-separated with following meaning:" << endl; + if (tree->isSuperTree()) + out << "# Part: Partition ID (1=" << ((PhyloSuperTree*)tree)->front()->aln->name << ", etc)" << endl + << "# Site: Site ID within partition (starting from 1 for each partition)" << endl; + else + out << "# Site: Alignment site ID" << endl; + if (bayes) + out << "# Rate: Posterior mean site rate" << endl + << "# Cat: Category with the highest posterior weight (0=invariable, 1=slow, etc)" << endl + << "# CatRate: Rate of the highest weight category" << endl + << "# CatPP: Posterior probability of the highest weight category" << endl; + else + out << "# Rate: Site rate estimated by maximum likelihood" << endl; + // write the main header + msg = ""; + if (tree->isSuperTree()) msg += "Part\t"; + msg += "Site\tRate"; + if (bayes) msg += "\tCat\tCatRate\tCatPP"; + out << msg << endl; + // infer and write the site rates + tree->writeSiteRates(out, bayes); + out.close(); + } catch (ios::failure) { + outError(ERR_WRITE_OUTPUT, filename); + } + cout << "Site rates printed to " << filename << endl; } -void printSiteStateFreq(const char* filename, Alignment *aln) { - if (aln->site_state_freq.empty()) - return; - size_t nsites = aln->getNSite(); - int nstates = aln->num_states; - try { - ofstream out; - out.exceptions(ios::failbit | ios::badbit); - out.open(filename); - IntVector pattern_index; - aln->getSitePatternIndex(pattern_index); - for (size_t i = 0; i < nsites; ++i) { - out.width(6); - out << left << i+1 << " "; - double *state_freq = aln->site_state_freq[pattern_index[i]]; - for (size_t j = 0; j < nstates; ++j) { - out.width(15); - out << state_freq[j] << " "; - } - out << endl; - } - out.close(); - cout << "Site state frequency vectors printed to " << filename << endl; - } catch (ios::failure) { - outError(ERR_WRITE_OUTPUT, filename); - } +void printSiteParam(const char* filename, Alignment *aln, const string ¶m_type) { + ASSERT(param_type == "freq" || param_type == "rate"); + if (param_type == "freq" && !aln->isSSF()) return; + if (param_type == "rate" && !aln->isSSR()) return; + size_t nsites = aln->getNSite(); + int nstates = aln->num_states; + try { + ofstream out; + out.exceptions(ios::failbit | ios::badbit); + out.open(filename); + IntVector site_pattern; + aln->getSitePatternIndex(site_pattern); + for (size_t site = 0; site < nsites; ++site) { + out.width(6); + out << left << site + 1 << " "; + if (param_type == "freq") { + double *state_freqs = aln->ptn_state_freq[site_pattern[site]]; + for (size_t x = 0; x < nstates; ++x) { + out.width(15); + out << state_freqs[x] << " "; + } + } else { + double rate = aln->ptn_rate_scaler[site_pattern[site]]; + out.width(15); + out << rate << " "; + } + out << endl; + } + out.close(); + } catch (ios::failure) { + outError(ERR_WRITE_OUTPUT, filename); + } + string msg = (param_type == "freq") ? "state frequency vectors" : "rate scalers"; + cout << "Site " << msg << " printed to " << filename << endl; } int countDistinctTrees(istream &in, bool rooted, IQTree *tree, IntVector &distinct_ids, bool exclude_duplicate) { diff --git a/main/treetesting.h b/main/treetesting.h index 2d1ae6851..9db338006 100644 --- a/main/treetesting.h +++ b/main/treetesting.h @@ -25,12 +25,12 @@ struct TreeInfo { double wkh_pvalue; // p-value by weighted Kishino-Hasegawa test double elw_value; // ELW - expected likelihood weights test bool elw_confident; // to represent confidence set of ELW test - double au_pvalue; // p-value by approximately unbiased (AU) test + double au_pvalue; // p-value by approximately unbiased (AU) test }; /** - * print site log likelihoods to a fileExists + * print site log likelihoods to a file * @param filename output file name * @param tree phylogenetic tree * @param ptn_lh pattern log-likelihoods, will be computed if NULL @@ -41,14 +41,14 @@ void printSiteLh(const char*filename, PhyloTree *tree, double *ptn_lh = NULL, bool append = false, const char *linename = NULL); /** - * print HMM results to a fileExists + * print HMM results to a file * @param filename output file name * @param tree IQTreeMixHmm */ void printHMMResult(const char*filename, PhyloTree *tree, int cat_assign_method = 0); /** - * print marginal probabilities to a fileExists + * print marginal probabilities to a file * @param filename output file name * @param tree IQTreeMixHmm */ @@ -80,18 +80,27 @@ void printSiteLhCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl) void printSiteProbCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl); /** - * print site state frequency vectors (for Huaichun) + * print site state frequency vectors (for -wsf option) * @param filename output file name * @param tree phylogenetic tree -*/ -void printSiteStateFreq(const char*filename, PhyloTree *tree, double *state_freqs = NULL); + */ +void printSiteStateFreq(const char*filename, PhyloTree *tree); /** - * print site state frequency vectors (for Huaichun) + * print site rates (for -wsr and --mlrate options) + * @param filename output file name + * @param tree phylogenetic tree + * @param bayes print mean posterior or ML rates + */ +void printSiteRate(const char *filename, PhyloTree *tree, bool bayes); + +/** + * print site state frequency vectors or rate scalers (for site-specific models) * @param filename output file name * @param aln alignment -*/ -void printSiteStateFreq(const char* filename, Alignment *aln); + * @param param_type should be set to "freq" or "rate" + */ +void printSiteParam(const char* filename, Alignment *aln, const string ¶m_type); /** print ancestral sequences diff --git a/model/modeldna.cpp b/model/modeldna.cpp index c10317c2f..398666a5d 100644 --- a/model/modeldna.cpp +++ b/model/modeldna.cpp @@ -202,8 +202,9 @@ void ModelDNA::init(const char *model_name, string model_params, StateFreqType f if (model_params != "") { readRates(model_params); } - - if (freq == FREQ_UNKNOWN || def_freq == FREQ_EQUAL) freq = def_freq; + if (def_freq == FREQ_EQUAL && phylo_tree->aln->isSSF()) + outError("DNA models with equal state frequencies are not suitable for site-specific frequencies"); + if (freq == FREQ_UNKNOWN || def_freq == FREQ_EQUAL) freq = def_freq; ModelMarkov::init(freq); // model_parameters = new double [getNDim()+1]; // see setVariables for explaination of +1 // setVariables(model_parameters); diff --git a/model/modelfactory.cpp b/model/modelfactory.cpp index 5530e4ae2..faceffd5a 100644 --- a/model/modelfactory.cpp +++ b/model/modelfactory.cpp @@ -24,11 +24,13 @@ #include "modelmarkov.h" #include "modelliemarkov.h" #include "modeldna.h" +#include "modeldnaerror.h" #include "modelprotein.h" #include "modelbin.h" #include "modelcodon.h" #include "modelmorphology.h" #include "modelpomo.h" +#include "modelpomomixture.h" #include "modelset.h" #include "modelmixture.h" #include "ratemeyerhaeseler.h" @@ -114,19 +116,167 @@ ModelsBlock *readModelsDefinition(Params ¶ms) { } if (params.model_def_file) { - cout << "Reading model definition file " << params.model_def_file << " ... "; + cout << "Reading model definition file " << params.model_def_file << " ..." << endl; MyReader nexus(params.model_def_file); nexus.Add(models_block); MyToken token(nexus.inf); nexus.Execute(token); int num_model = 0, num_freq = 0; - for (ModelsBlock::iterator it = models_block->begin(); it != models_block->end(); it++) + for (ModelsBlock::iterator it = models_block->begin(); it != models_block->end(); it++) { if (it->second.flag & NM_FREQ) num_freq++; else num_model++; + } cout << num_model << " models and " << num_freq << " frequency vectors loaded" << endl; } return models_block; } +ModelSubst* createModel(string model_str, ModelsBlock *models_block, + StateFreqType freq_type, string freq_params, + PhyloTree* tree) +{ + ModelSubst *model = NULL; + //cout << "Numstates: " << tree->aln->num_states << endl; + string model_params; + NxsModel *nxsmodel = models_block->findModel(model_str); + if (nxsmodel) model_params = nxsmodel->description; + + // PoMo. + bool pomo = false; + string pomo_rate_str = ""; + string pomo_heterozygosity = ""; + string::size_type p_pos = posPOMO(model_str); + pomo = (p_pos != string::npos); + + if (pomo) { + if (model_str[p_pos+2] == '{') { + string::size_type close_bracket = model_str.find("}", p_pos); + if (close_bracket == string::npos) { + cout << "Model string: " << model_str << endl; + outError("No closing bracket in PoMo parameters."); + } + else { + pomo_heterozygosity = model_str.substr(p_pos+3,close_bracket-p_pos-3); + model_str = model_str.substr(0, p_pos) + model_str.substr(close_bracket+1); + } + } + else { + model_str = model_str.substr(0, p_pos) + model_str.substr(p_pos + 2); + } + + size_t pomo_rate_start_pos; + if ((pomo_rate_start_pos = model_str.find("+G")) != string::npos) { + size_t pomo_rate_end_pos = model_str.find_first_of("+*", pomo_rate_start_pos+1); + if (pomo_rate_end_pos == string::npos) { + pomo_rate_str = model_str.substr(pomo_rate_start_pos, model_str.length() - pomo_rate_start_pos); + model_str = model_str.substr(0, pomo_rate_start_pos); + } else { + pomo_rate_str = model_str.substr(pomo_rate_start_pos, pomo_rate_end_pos - pomo_rate_start_pos); + model_str = model_str.substr(0, pomo_rate_start_pos) + model_str.substr(pomo_rate_end_pos); + } + } + } + + // sequencing error model + string seqerr = ""; + string::size_type spec_pos; + while ((spec_pos = model_str.find("+E")) != string::npos) { + string::size_type end_pos = model_str.find_first_of("+*", spec_pos+1); + if (end_pos == string::npos) { + seqerr = model_str.substr(spec_pos); + model_str = model_str.substr(0, spec_pos); + } else { + seqerr = model_str.substr(spec_pos, end_pos - spec_pos); + model_str = model_str.substr(0, spec_pos) + model_str.substr(end_pos); + } + } + + if (!seqerr.empty() && tree->aln->seq_type != SEQ_DNA) { + outError("Sequencing error model " + seqerr + " is only supported for DNA"); + } + // Now that PoMo stuff has been removed, check for model parameters. + size_t pos = model_str.find(OPEN_BRACKET); + if (pos != string::npos) { + if (model_str.rfind(CLOSE_BRACKET) != model_str.length()-1) + outError("Close bracket not found at the end of ", model_str); + string tmp_str = model_str; + // extract model_name + model_str = model_str.substr(0, pos); + + // extract model params + size_t end_pos = tmp_str.find(CLOSE_BRACKET); + + // handle cases that user doesn't specify model parameters but supply state frequencies + size_t pos_plus = model_str.find('+'); + if (pos_plus != string::npos) + { + model_params = ""; + model_str = model_str.substr(0, pos_plus); + } + else + model_params = tmp_str.substr(pos+1, end_pos-pos-1); + + // extract freqs (if specified) + pos = tmp_str.find("+FQ"); + if (pos != string::npos) + freq_type = FREQ_EQUAL; + pos = tmp_str.find("+F{"); + if (pos != string::npos) + { + freq_type = FREQ_USER_DEFINED; + tmp_str = tmp_str.substr(pos+3, tmp_str.length()-pos-3); + end_pos = tmp_str.find(CLOSE_BRACKET); + freq_params = tmp_str.substr(0, end_pos); + } + } + + /* + if ((model_str == "JC" && tree->aln->seq_type == SEQ_DNA) || + (model_str == "POISSON" && tree->aln->seq_type == SEQ_PROTEIN) || + (model_str == "JC2" && tree->aln->seq_type == SEQ_BINARY) || + (model_str == "JCC" && tree->aln->seq_type == SEQ_CODON) || + (model_str == "MK" && tree->aln->seq_type == SEQ_MORPH)) + { + model = new ModelSubst(tree->aln->num_states); + } else */ + + if ((pomo) || (tree->aln->seq_type == SEQ_POMO)) { + if (pomo_rate_str == "") + model = new ModelPoMo(model_str.c_str(), model_params, freq_type, freq_params, tree, pomo_heterozygosity); + else + model = new ModelPoMoMixture(model_str.c_str(), model_params, freq_type, freq_params, tree, pomo_heterozygosity, pomo_rate_str); + if (model->isMixture() && verbose_mode >= VB_MED) + cout << "PoMo mixture model for Gamma rate heterogeneity." << endl; +// else if ((model_str == "GTR" && tree->aln->seq_type == SEQ_DNA) || +// (model_str == "GTR2" && tree->aln->seq_type == SEQ_BINARY) || +// (model_str == "GTR20" && tree->aln->seq_type == SEQ_PROTEIN)) { +// model = new ModelGTR(tree, count_rates); +// if (freq_params != "") +// ((ModelGTR*)model)->readStateFreq(freq_params); +// if (model_params != "") +// ((ModelGTR*)model)->readRates(model_params); +// ((ModelGTR*)model)->init(freq_type); +// } else + } else if (ModelMarkov::validModelName(model_str)) { + model = ModelMarkov::getModelByName(model_str, tree, model_params, freq_type, freq_params); + } else if (tree->aln->seq_type == SEQ_BINARY) { + model = new ModelBIN(model_str.c_str(), model_params, freq_type, freq_params, tree); + } else if (tree->aln->seq_type == SEQ_DNA) { + if (seqerr.empty()) + model = new ModelDNA(model_str.c_str(), model_params, freq_type, freq_params, tree); + else + model = new ModelDNAError(model_str.c_str(), model_params, freq_type, freq_params, seqerr, tree); + } else if (tree->aln->seq_type == SEQ_PROTEIN) { + model = new ModelProtein(model_str.c_str(), model_params, freq_type, freq_params, tree, models_block); + } else if (tree->aln->seq_type == SEQ_CODON) { + model = new ModelCodon(model_str.c_str(), model_params, freq_type, freq_params, tree); + } else if (tree->aln->seq_type == SEQ_MORPH) { + model = new ModelMorphology(model_str.c_str(), model_params, freq_type, freq_params, tree); + } else { + outError("Unsupported model type"); + } + return model; +} + ModelFactory::ModelFactory() : CheckpointFactory() { model = NULL; site_rate = NULL; @@ -198,7 +348,8 @@ ModelFactory::ModelFactory(Params ¶ms, string &model_name, PhyloTree *tree, mix_pos = next_mix_pos; } if (new_model_str != model_str) - cout << "Model " << model_str << " is alias for " << new_model_str << endl; + if (verbose_mode >= VB_MIN) + cout << "Model " << model_str << " is alias for " << new_model_str << endl; model_str = new_model_str; // nxsmodel = models_block->findModel(model_str); @@ -397,7 +548,6 @@ ModelFactory::ModelFactory(Params ¶ms, string &model_name, PhyloTree *tree, case SEQ_PROTEIN: break; // let ModelProtein decide by itself case SEQ_MORPH: freq_type = FREQ_EQUAL; break; case SEQ_CODON: freq_type = FREQ_UNKNOWN; break; - break; default: freq_type = FREQ_EMPIRICAL; break; // default for DNA, PoMo and others: counted frequencies from alignment } } @@ -567,12 +717,15 @@ ModelFactory::ModelFactory(Params ¶ms, string &model_name, PhyloTree *tree, /******************** initialize model ****************************/ - if (tree->aln->site_state_freq.empty()) { - if (model_str.length() >= 4 && model_str.substr(0, 4) == "MIX{") { - string model_list; + if (tree->aln->isSSR() && params.fixed_branch_length == BRLEN_FIX) + outError("-blfix option is incompatible with site-specific rates"); + + if (!tree->aln->isSSF() && !tree->aln->isSSR()) { + // simple or mixture model + if (model_str.substr(0, 4) == "MIX{") { if (model_str.rfind(CLOSE_BRACKET) != model_str.length()-1) outError("Close bracket not found at the end of ", model_str); - model_list = model_str.substr(4, model_str.length()-5); + string model_list = model_str.substr(4, model_str.length()-5); model_str = model_str.substr(0, 3); model = new ModelMixture(model_name, model_str, model_list, models_block, freq_type, freq_params, tree, optimize_mixmodel_weight); } else if (freq_type == FREQ_MIXTURE) { @@ -587,40 +740,20 @@ ModelFactory::ModelFactory(Params ¶ms, string &model_name, PhyloTree *tree, // fused_mix_rate &= model->isMixture() && site_rate->getNRate() > 1; } else { // site-specific model - if (model_str == "JC" || model_str == "POISSON") - outError("JC is not suitable for site-specific model"); - model = new ModelSet(model_str.c_str(), tree); - ModelSet *models = (ModelSet*)model; // assign pointer for convenience - models->init((params.freq_type != FREQ_UNKNOWN) ? params.freq_type : FREQ_EMPIRICAL); - models->pattern_model_map.resize(tree->aln->getNPattern(), -1); - for (size_t i = 0; i < tree->aln->getNSite(); ++i) { - models->pattern_model_map[tree->aln->getPatternID(i)] = tree->aln->site_model[i]; - //cout << "site " << i << " ptn " << tree->aln->getPatternID(i) << " -> model " << site_model[i] << endl; - } - double *state_freq = new double[model->num_states]; - double *rates = new double[model->getNumRateEntries()]; - for (size_t i = 0; i < tree->aln->site_state_freq.size(); ++i) { - ModelMarkov *modeli; - if (i == 0) { - modeli = (ModelMarkov*)createModel(model_str, models_block, (params.freq_type != FREQ_UNKNOWN) ? params.freq_type : FREQ_EMPIRICAL, "", tree); - modeli->getStateFrequency(state_freq); - modeli->getRateMatrix(rates); - } else { - modeli = (ModelMarkov*)createModel(model_str, models_block, FREQ_EQUAL, "", tree); - modeli->setStateFrequency(state_freq); - modeli->setRateMatrix(rates); - } - if (tree->aln->site_state_freq[i]) - modeli->setStateFrequency (tree->aln->site_state_freq[i]); - - modeli->init(FREQ_USER_DEFINED); - models->push_back(modeli); + if (model_str.substr(0, 4) == "MIX{") + outError("Matrix mixture models are incompatible with site-specific models"); + if (freq_type == FREQ_MIXTURE && !params.tree_freq_file) + outError("State frequency mixture models are incompatible with site-specific models"); + if (params.tree_freq_file) { + if (verbose_mode >= VB_MIN) + cout << "NOTE: Switching frequency mixture model to SSF model" << endl; + freq_type = FREQ_EQUAL; + freq_params = ""; + } else if (params.site_freq_file) { + if (verbose_mode >= VB_MIN) + cout << "NOTE: Using SSF model" << endl; } - delete [] rates; - delete [] state_freq; - - models->joinEigenMemory(); - models->decomposeRateMatrix(); + model = new ModelSet(model_str, models_block, freq_type, freq_params, tree); // delete information of the old alignment // tree->aln->ordered_pattern.clear(); @@ -721,14 +854,16 @@ ModelFactory::ModelFactory(Params ¶ms, string &model_name, PhyloTree *tree, // choose discrete/continuous gamma if (posG != string::npos && posGC != string::npos) { - cout << "NOTE: both +G and +GC were specified, continue with " - << ((posG < posGC)? rate_str.substr(posG,2) : rate_str.substr(posGC,3)) << endl; + if (verbose_mode >= VB_MIN) + cout << "NOTE: both +G and +GC were specified, continue with " + << ((posG < posGC)? rate_str.substr(posG,2) : rate_str.substr(posGC,3)) << endl; is_continuous_gamma = posGC < posG; } if (posG2 != string::npos && posGC != string::npos) { - cout << "NOTE: both *G and +GC were specified, continue with " - << ((posG2 < posGC)? rate_str.substr(posG2,2) : rate_str.substr(posGC,3)) << endl; + if (verbose_mode >= VB_MIN) + cout << "NOTE: both *G and +GC were specified, continue with " + << ((posG2 < posGC)? rate_str.substr(posG2,2) : rate_str.substr(posGC,3)) << endl; is_continuous_gamma = posGC < posG2; } @@ -746,8 +881,9 @@ ModelFactory::ModelFactory(Params ¶ms, string &model_name, PhyloTree *tree, else { if (posG != string::npos && posG2 != string::npos) { - cout << "NOTE: both +G and *G were specified, continue with " - << ((posG < posG2)? rate_str.substr(posG,2) : rate_str.substr(posG2,2)) << endl; + if (verbose_mode >= VB_MIN) + cout << "NOTE: both +G and *G were specified, continue with " + << ((posG < posG2)? rate_str.substr(posG,2) : rate_str.substr(posG2,2)) << endl; } if (posG2 != string::npos && posG2 < posG) { posG = posG2; @@ -759,14 +895,16 @@ ModelFactory::ModelFactory(Params ¶ms, string &model_name, PhyloTree *tree, string::size_type posR2 = rate_str.find("*R"); // FreeRate model if (posG != string::npos && (posR != string::npos || posR2 != string::npos)) { - outWarning("Both Gamma and FreeRate models were specified, continue with FreeRate model"); + if (verbose_mode >= VB_MIN) + outWarning("Both Gamma and FreeRate models were specified, continue with FreeRate model"); posG = string::npos; fused_mix_rate = false; } if (posR != string::npos && posR2 != string::npos) { - cout << "NOTE: both +R and *R were specified, continue with " - << ((posR < posR2)? rate_str.substr(posR,2) : rate_str.substr(posR2,2)) << endl; + if (verbose_mode >= VB_MIN) + cout << "NOTE: both +R and *R were specified, continue with " + << ((posR < posR2)? rate_str.substr(posR,2) : rate_str.substr(posR2,2)) << endl; } if (posR2 != string::npos && posR2 < posR) { @@ -778,20 +916,23 @@ ModelFactory::ModelFactory(Params ¶ms, string &model_name, PhyloTree *tree, string::size_type posH2 = rate_str.find("*H"); // heterotachy model if (posG != string::npos && (posH != string::npos || posH2 != string::npos)) { - outWarning("Both Gamma and heterotachy models were specified, continue with heterotachy model"); + if (verbose_mode >= VB_MIN) + outWarning("Both Gamma and heterotachy models were specified, continue with heterotachy model"); posG = string::npos; fused_mix_rate = false; } if (posR != string::npos && (posH != string::npos || posH2 != string::npos)) { - outWarning("Both FreeRate and heterotachy models were specified, continue with heterotachy model"); + if (verbose_mode >= VB_MIN) + outWarning("Both FreeRate and heterotachy models were specified, continue with heterotachy model"); posR = string::npos; fused_mix_rate = false; } if (posH != string::npos && posH2 != string::npos) { - cout << "NOTE: both +H and *H were specified, continue with " - << ((posH < posH2)? rate_str.substr(posH,2) : rate_str.substr(posH2,2)) << endl; + if (verbose_mode >= VB_MIN) + cout << "NOTE: both +H and *H were specified, continue with " + << ((posH < posH2)? rate_str.substr(posH,2) : rate_str.substr(posH2,2)) << endl; } if (posH2 != string::npos && posH2 < posH) { posH = posH2; @@ -882,8 +1023,11 @@ ModelFactory::ModelFactory(Params ¶ms, string &model_name, PhyloTree *tree, outError("Wrong model name ", rate_str); } - + bool rate_mixture = false; if (rate_str.find('+') != string::npos || rate_str.find('*') != string::npos) { + rate_mixture = true; + } + if (!tree->aln->isSSR() && rate_mixture) { //string rate_str = model_str.substr(pos); if (posI != string::npos && posH != string::npos) { site_rate = new RateHeterotachyInvar(num_rate_cats, heterotachy_params, p_invar_sites, tree); @@ -955,11 +1099,21 @@ ModelFactory::ModelFactory(Params ¶ms, string &model_name, PhyloTree *tree, // else // model_str = model_str.substr(0, model_str.find('*')); } else { - site_rate = new RateHeterogeneity(); - site_rate->setTree(tree); + if (rate_mixture && !params.tree_rate_file) + outError("Rate or branch mixture models are incompatible with site-specific rates"); + if (params.tree_rate_file) { + if (verbose_mode >= VB_MIN) + cout << "NOTE: Switching rate mixture model to SSR model" << endl; + } else if (params.site_rate_file) { + if (verbose_mode >= VB_MIN) + cout << "NOTE: Using SSR model" << endl; + } + site_rate = new RateHeterogeneity(tree); } if (fused_mix_rate) { + if (tree->aln->isSSF()) + outError("Unlinked rate or branch mixture models are incompatible with site-specific frequencies"); if (!model->isMixture()) { if (verbose_mode >= VB_MED) cout << endl << "NOTE: Using mixture model with unlinked " << model_str << " parameters" << endl; diff --git a/model/modelfactory.h b/model/modelfactory.h index 1bb598b0c..a77c85281 100644 --- a/model/modelfactory.h +++ b/model/modelfactory.h @@ -54,7 +54,21 @@ string::size_type posRateFree(string &model_name); string::size_type posPOMO(string &model_name); /** -Store the transition matrix corresponding to evolutionary time so that one must not compute again. + * create a substitution model + * @param model_str model name + * @param models_block + * @param freq_type state frequency type + * @param freq_params frequency parameters + * @param tree associated phylo tree + * @return substitution model created + */ +ModelSubst *createModel(string model_str, ModelsBlock *models_block, + StateFreqType freq_type, string freq_params, + PhyloTree *tree); + + +/** +Store the transition matrix corresponding to evolutionary time so that one must not compute again. For efficiency purpose esp. for protein (20x20) or codon (61x61). The values of the map contain 3 matricies consecutively: transition matrix, 1st, and 2nd derivative @@ -210,7 +224,6 @@ class ModelFactory : public unordered_map, public Optimization, pu */ ModelSubst *model; - /** pointer to the site-rate heterogeneity, will not be deleted when deleting ModelFactory object */ @@ -279,7 +292,6 @@ class ModelFactory : public unordered_map, public Optimization, pu protected: - /** this function is served for the multi-dimension optimization. It should pack the model parameters into a vector that is index from 1 (NOTE: not from 0) @@ -295,8 +307,9 @@ class ModelFactory : public unordered_map, public Optimization, pu */ virtual bool getVariables(double *variables); - vector optimizeGammaInvWithInitValue(int fixed_len, double logl_epsilon, double gradient_epsilon, - double initPInv, double initAlpha, DoubleVector &lenvec, Checkpoint *model_ckp); + vector optimizeGammaInvWithInitValue(int fixed_len, double logl_epsilon, double gradient_epsilon, + double initPInv, double initAlpha, DoubleVector &lenvec, Checkpoint *model_ckp); + }; #endif diff --git a/model/modelmarkov.cpp b/model/modelmarkov.cpp index 067c19922..7605cb60d 100644 --- a/model/modelmarkov.cpp +++ b/model/modelmarkov.cpp @@ -1138,29 +1138,32 @@ bool ModelMarkov::isUnstableParameters() { void ModelMarkov::setBounds(double *lower_bound, double *upper_bound, bool *bound_check) { // ASSERT(is_reversible && "setBounds should only be called on subclass of ModelMarkov"); - - int i, ndim = getNDim(); - - for (i = 1; i <= ndim; i++) { - //cout << variables[i] << endl; - lower_bound[i] = MIN_RATE; - upper_bound[i] = MAX_RATE; - bound_check[i] = false; - } - + int ndim = getNDim(); + for (int i = 1; i <= ndim; i++) { + //cout << variables[i] << endl; + lower_bound[i] = MIN_RATE; + upper_bound[i] = MAX_RATE; + bound_check[i] = false; + } if (is_reversible && freq_type == FREQ_ESTIMATE) { - for (i = num_params+1; i <= num_params+num_states-1; i++) { -// lower_bound[i] = MIN_FREQUENCY/state_freq[highest_freq_state]; + // find the highest state freq + for (int i = 0; i < num_states; i++) { + if (state_freq[i] > state_freq[highest_freq_state]) { + highest_freq_state = i; + } + } + for (int i = num_params + 1; i <= num_params + num_states - 1; i++) { +// lower_bound[i] = MIN_FREQUENCY/state_freq[highest_freq_state]; // upper_bound[i] = state_freq[highest_freq_state]/MIN_FREQUENCY; - lower_bound[i] = Params::getInstance().min_state_freq; -// upper_bound[i] = 100.0; - upper_bound[i] = 1.0; - bound_check[i] = false; - } + lower_bound[i] = Params::getInstance().min_state_freq; +// upper_bound[i] = 100.0; + upper_bound[i] = 1.0; + bound_check[i] = false; + } } else if (phylo_tree->aln->seq_type == SEQ_DNA) { - setBoundsForFreqType(&lower_bound[num_params+1], &upper_bound[num_params+1], - &bound_check[num_params+1], Params::getInstance().min_state_freq, freq_type); - } + setBoundsForFreqType(&lower_bound[num_params+1], &upper_bound[num_params+1], + &bound_check[num_params+1], Params::getInstance().min_state_freq, freq_type); + } } double ModelMarkov::optimizeParameters(double gradient_epsilon) { @@ -1185,11 +1188,12 @@ double ModelMarkov::optimizeParameters(double gradient_epsilon) { bool *bound_check = new bool[ndim+1]; double score; - for (int i = 0; i < num_states; i++) { - if (state_freq[i] > state_freq[highest_freq_state]) { - highest_freq_state = i; - } - } +// Transferred to ModelMarkov::setBounds for ModelSet compatibility +// for (int i = 0; i < num_states; i++) { +// if (state_freq[i] > state_freq[highest_freq_state]) { +// highest_freq_state = i; +// } +// } // by BFGS algorithm setVariables(variables); @@ -1737,37 +1741,33 @@ void ModelMarkov::readRates(string str) noexcept(false) { } void ModelMarkov::readStateFreq(istream &in) noexcept(false) { - int i; - for (i = 0; i < num_states; i++) { - string tmp_value; - in >> tmp_value; - if (tmp_value.length() == 0) - throw "State frequencies could not be read"; - state_freq[i] = convert_double_with_distribution(tmp_value.c_str(), true); + for (int i = 0; i < num_states; i++) { + string tmp_value; + in >> tmp_value; + if (tmp_value.length() == 0) + throw "State frequencies could not be read"; + state_freq[i] = convert_double_with_distribution(tmp_value.c_str(), true); if (state_freq[i] < 0.0) throw "Negative state frequencies found"; } double sum = 0.0; - for (i = 0; i < num_states; i++) sum += state_freq[i]; - if (fabs(sum-1.0) >= 1e-7) - { - outWarning("Normalizing state frequencies so that sum of them equals to 1"); - sum = 1.0/sum; - for (i = 0; i < num_states; i++) - state_freq[i] *= sum; - } + for (int i = 0; i < num_states; i++) sum += state_freq[i]; + if (fabs(sum) <= 1e-5) + outError("Sum of all state frequencies must be greater than zero!"); + if (fabs(sum-1.0) >= 1e-7) { + if (verbose_mode >= VB_MIN) + outWarning("Normalizing state frequencies so that sum of them equals to 1"); + for (int i = 0; i < num_states; i++) state_freq[i] /= sum; + } } void ModelMarkov::readStateFreq(string str) noexcept(false) { int i; int end_pos = 0; - - // detect the seperator - char separator = ','; - if (str.find('/') != std::string::npos) - separator = '/'; - - for (i = 0; i < num_states; i++) { + // detect the seperator + char separator = ','; + if (str.find('/') != std::string::npos) separator = '/'; + for (int i = 0; i < num_states; i++) { int new_end_pos; state_freq[i] = convert_double_with_distribution(str.substr(end_pos).c_str(), new_end_pos, true, separator); end_pos += new_end_pos; @@ -1779,27 +1779,26 @@ void ModelMarkov::readStateFreq(string str) noexcept(false) { if (end_pos < str.length() && str[end_pos] != ',' && str[end_pos] != ' ' && str[end_pos] != '/') outError("Comma/Space/Forward slash to separate state frequencies not found in ", str); end_pos++; - if (i < num_states - 1 && end_pos >= str.length()) - outError("The number of frequencies ("+convertIntToString(i+1)+") is less than the number of states ("+convertIntToString(num_states)+"). Please check and try again."); + if (i < num_states - 1 && end_pos >= str.length()) + outError("The number of frequencies ("+convertIntToString(i+1)+") is less than the number of states ("+convertIntToString(num_states)+"). Please check and try again."); } double sum = 0.0; - for (i = 0; i < num_states; i++) sum += state_freq[i]; - if (fabs(sum) <= 1e-5) - outError("Sum of all state frequencies must be greater than zero!"); - if (fabs(sum-1.0) >= 1e-7) - { - outWarning("Normalizing State frequencies so that sum of them equals to 1"); - sum = 1.0/sum; - for (i = 0; i < num_states; i++) - state_freq[i] *= sum; - } + for (int i = 0; i < num_states; i++) sum += state_freq[i]; + if (fabs(sum) <= 1e-5) + outError("Sum of all state frequencies must be greater than zero!"); + if (fabs(sum-1.0) >= 1e-7) { + if (verbose_mode >= VB_MIN) + outWarning("Normalizing state frequencies so that sum of them equals to 1"); + for (int i = 0; i < num_states; i++) state_freq[i] /= sum; + } } void ModelMarkov::readParameters(const char *file_name, bool adapt_tree) { if (!fileExists(file_name)) outError("File not found ", file_name); - cout << "Reading model parameters from file " << file_name << endl; + if (verbose_mode >= VB_MIN) + cout << "Reading model parameters from file " << file_name << endl; // if detect if reading full matrix or half matrix by the first entry try { diff --git a/model/modelmarkov.h b/model/modelmarkov.h index 823ffe66b..8ca48fc05 100644 --- a/model/modelmarkov.h +++ b/model/modelmarkov.h @@ -263,7 +263,7 @@ class ModelMarkov : public ModelSubst, public EigenDecomposition rescale the state frequencies @param sum_one TRUE to make frequencies sum to 1, FALSE to make last entry equal to 1 */ - void scaleStateFreq(bool sum_one); + virtual void scaleStateFreq(bool sum_one); /** get frequency type diff --git a/model/modelmixture.cpp b/model/modelmixture.cpp index de8772e39..796f95911 100644 --- a/model/modelmixture.cpp +++ b/model/modelmixture.cpp @@ -5,18 +5,9 @@ * Author: minh */ -#include "modelmarkov.h" -#include "modeldna.h" -#include "modeldnaerror.h" -#include "modelprotein.h" -#include "modelbin.h" -#include "modelcodon.h" -#include "modelmorphology.h" -#include "modelset.h" #include "modelmixture.h" -#include "modelpomo.h" +#include "modelfactory.h" //#include "phylokernelmixture.h" -#include "modelpomomixture.h" using namespace std; @@ -1036,154 +1027,6 @@ const double MAX_MIXTURE_PROP = 1000.0; //const double MIN_MIXTURE_RATE = 0.01; //const double MAX_MIXTURE_RATE = 100.0; -ModelSubst* createModel(string model_str, ModelsBlock *models_block, - StateFreqType freq_type, string freq_params, - PhyloTree* tree) -{ - ModelSubst *model = NULL; - //cout << "Numstates: " << tree->aln->num_states << endl; - string model_params; - NxsModel *nxsmodel = models_block->findModel(model_str); - if (nxsmodel) model_params = nxsmodel->description; - - // PoMo. - bool pomo = false; - string pomo_rate_str = ""; - string pomo_heterozygosity = ""; - string::size_type p_pos = posPOMO(model_str); - pomo = (p_pos != string::npos); - - if (pomo) { - if (model_str[p_pos+2] == '{') { - string::size_type close_bracket = model_str.find("}", p_pos); - if (close_bracket == string::npos) { - cout << "Model string: " << model_str << endl; - outError("No closing bracket in PoMo parameters."); - } - else { - pomo_heterozygosity = model_str.substr(p_pos+3,close_bracket-p_pos-3); - model_str = model_str.substr(0, p_pos) + model_str.substr(close_bracket+1); - } - } - else { - model_str = model_str.substr(0, p_pos) + model_str.substr(p_pos + 2); - } - - size_t pomo_rate_start_pos; - if ((pomo_rate_start_pos = model_str.find("+G")) != string::npos) { - size_t pomo_rate_end_pos = model_str.find_first_of("+*", pomo_rate_start_pos+1); - if (pomo_rate_end_pos == string::npos) { - pomo_rate_str = model_str.substr(pomo_rate_start_pos, model_str.length() - pomo_rate_start_pos); - model_str = model_str.substr(0, pomo_rate_start_pos); - } else { - pomo_rate_str = model_str.substr(pomo_rate_start_pos, pomo_rate_end_pos - pomo_rate_start_pos); - model_str = model_str.substr(0, pomo_rate_start_pos) + model_str.substr(pomo_rate_end_pos); - } - } - } - - // sequencing error model - string seqerr = ""; - string::size_type spec_pos; - while ((spec_pos = model_str.find("+E")) != string::npos) { - string::size_type end_pos = model_str.find_first_of("+*", spec_pos+1); - if (end_pos == string::npos) { - seqerr = model_str.substr(spec_pos); - model_str = model_str.substr(0, spec_pos); - } else { - seqerr = model_str.substr(spec_pos, end_pos - spec_pos); - model_str = model_str.substr(0, spec_pos) + model_str.substr(end_pos); - } - } - - if (!seqerr.empty() && tree->aln->seq_type != SEQ_DNA) { - outError("Sequencing error model " + seqerr + " is only supported for DNA"); - } - // Now that PoMo stuff has been removed, check for model parameters. - size_t pos = model_str.find(OPEN_BRACKET); - if (pos != string::npos) { - if (model_str.rfind(CLOSE_BRACKET) != model_str.length()-1) - outError("Close bracket not found at the end of ", model_str); - string tmp_str = model_str; - // extract model_name - model_str = model_str.substr(0, pos); - - // extract model params - size_t end_pos = tmp_str.find(CLOSE_BRACKET); - - // handle cases that user doesn't specify model parameters but supply state frequencies - size_t pos_plus = model_str.find('+'); - if (pos_plus != string::npos) - { - model_params = ""; - model_str = model_str.substr(0, pos_plus); - } - else - model_params = tmp_str.substr(pos+1, end_pos-pos-1); - - // extract freqs (if specified) - pos = tmp_str.find("+FQ"); - if (pos != string::npos) - freq_type = FREQ_EQUAL; - pos = tmp_str.find("+F{"); - if (pos != string::npos) - { - freq_type = FREQ_USER_DEFINED; - tmp_str = tmp_str.substr(pos+3, tmp_str.length()-pos-3); - end_pos = tmp_str.find(CLOSE_BRACKET); - freq_params = tmp_str.substr(0, end_pos); - } - } - - /* - if ((model_str == "JC" && tree->aln->seq_type == SEQ_DNA) || - (model_str == "POISSON" && tree->aln->seq_type == SEQ_PROTEIN) || - (model_str == "JC2" && tree->aln->seq_type == SEQ_BINARY) || - (model_str == "JCC" && tree->aln->seq_type == SEQ_CODON) || - (model_str == "MK" && tree->aln->seq_type == SEQ_MORPH)) - { - model = new ModelSubst(tree->aln->num_states); - } else */ - - if ((pomo) || (tree->aln->seq_type == SEQ_POMO)) { - if (pomo_rate_str == "") - model = new ModelPoMo(model_str.c_str(), model_params, freq_type, freq_params, tree, pomo_heterozygosity); - else - model = new ModelPoMoMixture(model_str.c_str(), model_params, freq_type, freq_params, tree, pomo_heterozygosity, pomo_rate_str); - if (model->isMixture() && verbose_mode >= VB_MED) - cout << "PoMo mixture model for Gamma rate heterogeneity." << endl; -// else if ((model_str == "GTR" && tree->aln->seq_type == SEQ_DNA) || -// (model_str == "GTR2" && tree->aln->seq_type == SEQ_BINARY) || -// (model_str == "GTR20" && tree->aln->seq_type == SEQ_PROTEIN)) { -// model = new ModelGTR(tree, count_rates); -// if (freq_params != "") -// ((ModelGTR*)model)->readStateFreq(freq_params); -// if (model_params != "") -// ((ModelGTR*)model)->readRates(model_params); -// ((ModelGTR*)model)->init(freq_type); -// } else - } else if (ModelMarkov::validModelName(model_str)) { - model = ModelMarkov::getModelByName(model_str, tree, model_params, freq_type, freq_params); - } else if (tree->aln->seq_type == SEQ_BINARY) { - model = new ModelBIN(model_str.c_str(), model_params, freq_type, freq_params, tree); - } else if (tree->aln->seq_type == SEQ_DNA) { - if (seqerr.empty()) - model = new ModelDNA(model_str.c_str(), model_params, freq_type, freq_params, tree); - else - model = new ModelDNAError(model_str.c_str(), model_params, freq_type, freq_params, seqerr, tree); - } else if (tree->aln->seq_type == SEQ_PROTEIN) { - model = new ModelProtein(model_str.c_str(), model_params, freq_type, freq_params, tree, models_block); - } else if (tree->aln->seq_type == SEQ_CODON) { - model = new ModelCodon(model_str.c_str(), model_params, freq_type, freq_params, tree); - } else if (tree->aln->seq_type == SEQ_MORPH) { - model = new ModelMorphology(model_str.c_str(), model_params, freq_type, freq_params, tree); - } else { - outError("Unsupported model type"); - } - - return model; -} - /** constructor @param tree associated tree for the model diff --git a/model/modelmixture.h b/model/modelmixture.h index d564d146e..a9982640d 100644 --- a/model/modelmixture.h +++ b/model/modelmixture.h @@ -8,35 +8,18 @@ #ifndef MODELMIXTURE_H_ #define MODELMIXTURE_H_ -#include "tree/phylotree.h" -#include "modelsubst.h" #include "modelmarkov.h" #include "nclextra/modelsblock.h" - +#include "tree/phylotree.h" extern const char* builtin_mixmodels_definition; -/** - * create a substitution model - * @param model_str model nme - * @param freq_type state frequency type - * @param freq_params frequency parameters - * @param seqerr sequencing error model - * @param tree associated phylo tree - * @param count_rates TRUE to assign rates counted from alignment, FALSE to not initialize rates - * @return substitution model created - */ -ModelSubst *createModel(string model_str, ModelsBlock *models_block, - StateFreqType freq_type, string freq_params, - PhyloTree *tree); - /** * mixture model */ class ModelMixture: virtual public ModelMarkov, public vector { public: - /** constructor @param model_name model name, e.g., JC, HKY. diff --git a/model/modelset.cpp b/model/modelset.cpp index 74778e258..8ad2b5859 100644 --- a/model/modelset.cpp +++ b/model/modelset.cpp @@ -18,12 +18,288 @@ #include "modelset.h" +#include "modelfactory.h" -ModelSet::ModelSet(const char *model_name, PhyloTree *tree) : ModelMarkov(tree) +ModelSet::ModelSet(const string model_name, ModelsBlock *models_block, + StateFreqType freq, string freq_params, PhyloTree *tree) + : ModelMarkov(tree) { name = full_name = model_name; - name += "+SSF"; - full_name += "+site-specific state-frequency model (unpublished)"; + full_name += "+site-specific state frequency or rate model (unpublished)"; + // init the wrapper model to use its eigen + ModelMarkov::init(FREQ_EMPIRICAL); // +F is used here to calculate +I under SSF + ModelMarkov::fixParameters(true); // yet otherwise the wrapper model parameters remain unused + // init submodels + ASSERT(freq != FREQ_MIXTURE); + if (isSSF()) { // default freqs for unspecified sites under SSF + freq = FREQ_EQUAL; + freq_params = ""; + } + double *state_freqs = new double[num_states]; + double *rate_mat = new double[getNumRateEntries()]; + for (size_t ptn = 0; ptn < phylo_tree->aln->getNPattern(); ptn++) { + ModelMarkov *submodel; + if (ptn == 0) { // the front submodel, other submodels will take after it + submodel = (ModelMarkov*)createModel(model_name, models_block, freq, freq_params, tree); + if (!submodel->isReversible()) + outError("Non-reversible models are incompatible with site-specific models"); + freq = submodel->getFreqType(); + submodel->getStateFrequency(state_freqs); + submodel->getRateMatrix(rate_mat); + } else { + submodel = (ModelMarkov*)createModel(model_name, models_block, FREQ_EQUAL, "", tree); + submodel->setFreqType(freq); + submodel->setStateFrequency(state_freqs); + submodel->setRateMatrix(rate_mat); + } + // set site-specific freqs (if any) and re-init + if (isSSF()) { + if (phylo_tree->aln->ptn_state_freq[ptn]) { + submodel->setStateFrequency(phylo_tree->aln->ptn_state_freq[ptn]); + } // else: unspecified site, continue with the default equal freqs + submodel->init(FREQ_USER_DEFINED); + } + push_back(submodel); + } + delete [] state_freqs; + delete [] rate_mat; + // normalize site-specific rates (if any) and rescale the tree + if (isSSR()) { + double mean_rate = phylo_tree->aln->normalizePtnRateScaler(); + if (mean_rate != 1.0) { + phylo_tree->scaleLength(mean_rate); + phylo_tree->clearAllPartialLH(); + } + } + // generate the site-specific eigen of the wrapper model + joinEigenMemory(); + decomposeRateMatrix(); +} + +ModelSet::~ModelSet() +{ + for (reverse_iterator rit = rbegin(); rit != rend(); rit++) { + (*rit)->eigenvalues = nullptr; + (*rit)->eigenvectors = nullptr; + (*rit)->inv_eigenvectors = nullptr; + (*rit)->inv_eigenvectors_transposed = nullptr; + delete (*rit); + } +} + +void ModelSet::setCheckpoint(Checkpoint *checkpoint) +{ + CheckpointFactory::setCheckpoint(checkpoint); + front()->setCheckpoint(checkpoint); +} + +void ModelSet::startCheckpoint() +{ + checkpoint->startStruct("ModelSet"); +} + +void ModelSet::saveCheckpoint() +{ + startCheckpoint(); + checkpoint->startStruct("frontModel"); + front()->saveCheckpoint(); + checkpoint->endStruct(); + endCheckpoint(); +} + +void ModelSet::restoreCheckpoint() +{ + startCheckpoint(); + checkpoint->startStruct("frontModel"); + front()->restoreCheckpoint(); + checkpoint->endStruct(); + endCheckpoint(); + if (getNDim() > 0) { + double *state_freqs = new double[num_states]; + double *rate_mat = new double[getNumRateEntries()]; + getStateFrequency(state_freqs); + getRateMatrix(rate_mat); + for (iterator it = begin(); it != end(); it++) { + if (getFreqType() == FREQ_ESTIMATE) + (*it)->setStateFrequency(state_freqs); + (*it)->setRateMatrix(rate_mat); + } + delete [] state_freqs; + delete [] rate_mat; + } + decomposeRateMatrix(); + if (phylo_tree) + phylo_tree->clearAllPartialLH(); +} + +string ModelSet::getName() +{ + if (isSSF()) { + return name + "+SSF"; + } else { + return front()->getName(); + } +} + +string ModelSet::getNameParams(bool show_fixed_params) +{ + ostringstream retname; + retname << name; + if (!front()->fixed_parameters && front()->num_params > 0) { + double *rate_mat = front()->rates; + retname << '{'; + int nrates = getNumRateEntries(); + for (int i = 0; i < nrates; i++) { + if (i > 0) retname << ','; + retname << rate_mat[i]; + } + retname << '}'; + } + if (isSSF()) { + retname << "+SSF"; + } else { + front()->getNameParamsFreq(retname); + } + return retname.str(); +} + +void ModelSet::writeInfo(ostream& out) +{ + if (empty()) return; + if (verbose_mode >= VB_DEBUG) { + out << "Wrapper model:" << endl; + ModelMarkov::writeInfo(out); + int i = 1; + for (iterator it = begin(); it != end(); it++, i++) { + out << "Partition " << i << ":" << endl; + (*it)->writeInfo(out); + } + } else { + front()->writeInfo(out); + } +} + +void ModelSet::getRateMatrix(double *rate_mat) { + front()->getRateMatrix(rate_mat); +} + +void ModelSet::getStateFrequency(double *state_freqs, int mixture) +{ + if (isSSF()) { // get the +F freqs under SSF + ModelMarkov::getStateFrequency(state_freqs); + } else { + front()->getStateFrequency(state_freqs); + } +} + +void ModelSet::getQMatrix(double *q_mat, int mixture) { + if (isSSF()) { // get the +F derived QMatrix under SSF + ModelMarkov::getQMatrix(q_mat); + } else { + front()->getQMatrix(q_mat); + } +} + +StateFreqType ModelSet::getFreqType() +{ + if (isSSF()) { // get the +F type under SSF + return ModelMarkov::getFreqType(); + } else { + return front()->getFreqType(); + } +} + +int ModelSet::getNDim() +{ + return front()->getNDim(); +} + +int ModelSet::getNDimFreq() +{ + return front()->getNDimFreq(); +} + +bool ModelSet::isUnstableParameters() +{ + return front()->isUnstableParameters(); +} + +void ModelSet::setBounds(double *lower_bound, double *upper_bound, bool *bound_check) +{ + front()->setBounds(lower_bound, upper_bound, bound_check); +} + +void ModelSet::setVariables(double* variables) +{ + front()->setVariables(variables); +} + +bool ModelSet::getVariables(double* variables) +{ + bool changed = false; + for (iterator it = begin(); it != end(); it++) { + changed |= (*it)->getVariables(variables); + } + return changed; +} + +void ModelSet::scaleStateFreq(bool sum_one) +{ + for (iterator it = begin(); it != end(); it++) { + (*it)->scaleStateFreq(sum_one); + } +} + +double ModelSet::optimizeParameters(double gradient_epsilon) +{ + if (verbose_mode >= VB_MAX) + cout << "Optimizing " << getName() << " model parameters..." << endl; + int ndim = getNDim(); + if (ndim == 0) return 0.0; // return if nothing to be optimized + double score; + // optimize with the BFGS algorithm + double *variables = new double[ndim+1]; + double *upper_bound = new double[ndim+1]; + double *lower_bound = new double[ndim+1]; + bool *bound_check = new bool[ndim+1]; + setVariables(variables); + setBounds(lower_bound, upper_bound, bound_check); + score = -minimizeMultiDimen(variables, ndim, lower_bound, upper_bound, bound_check, max(gradient_epsilon, TOL_RATE)); + bool changed = getVariables(variables); + delete [] bound_check; + delete [] lower_bound; + delete [] upper_bound; + delete [] variables; + // BQM 2015-09-07: normalize state_freq + if (is_reversible && getFreqType() == FREQ_ESTIMATE) { + scaleStateFreq(true); + changed = true; + } + if (changed || score == -1.0e+30) { + decomposeRateMatrix(); + phylo_tree->clearAllPartialLH(); + score = phylo_tree->computeLikelihood(); + } + return score; +} + +double ModelSet::targetFunk(double x[]) +{ + bool changed = getVariables(x); + if (changed) { + decomposeRateMatrix(); + ASSERT(phylo_tree); + phylo_tree->clearAllPartialLH(); + } + // avoid numerical issue if state_freq is too small + double *state_freqs = (isSSF()) ? this->state_freq : front()->state_freq; + for (int x = 0; x < num_states; x++) { + if (state_freqs[x] < 0 || (state_freqs[x] >= 0 && + state_freqs[x] < Params::getInstance().min_state_freq)) { + return 1.0e+30; + } + } + return -phylo_tree->computeLikelihood(); } void ModelSet::computeTransMatrix(double time, double* trans_matrix, int mixture, int selected_row) @@ -50,51 +326,49 @@ void ModelSet::computeTransDerv(double time, double* trans_matrix, double* trans int ModelSet::getPtnModelID(int ptn) { - ASSERT(ptn >= 0 && ptn < pattern_model_map.size()); - ASSERT(pattern_model_map[ptn] >= 0 && pattern_model_map[ptn] < size()); - return pattern_model_map[ptn]; + ASSERT(ptn >= 0 && ptn < size()); + return ptn; } - -double ModelSet::computeTrans(double time, int model_id, int state1, int state2) { - if (phylo_tree->vector_size == 1) { - return at(model_id)->computeTrans(time, state1, state2); - } +double ModelSet::computeTrans(double time, int model_id, int state1, int state2) +{ + if (phylo_tree->vector_size == 1) { + return at(model_id)->computeTrans(time, state1, state2); + } // temporary fix problem with vectorized eigenvectors - int i; - int vsize = phylo_tree->vector_size; - int states_vsize = num_states*vsize; - int model_vec_id = model_id % vsize; - int start_ptn = model_id - model_vec_id; - double *evec = &eigenvectors[start_ptn*num_states*num_states + model_vec_id + state1*num_states*vsize]; - double *inv_evec = &inv_eigenvectors[start_ptn*num_states*num_states + model_vec_id + state2*vsize]; - double *eval = &eigenvalues[start_ptn*num_states + model_vec_id]; + int vsize = phylo_tree->vector_size; + int states_vsize = num_states*vsize; + int model_vec_id = model_id % vsize; + int start_ptn = model_id - model_vec_id; + double *evec = &eigenvectors[start_ptn*num_states*num_states + model_vec_id + state1*num_states*vsize]; + double *inv_evec = &inv_eigenvectors[start_ptn*num_states*num_states + model_vec_id + state2*vsize]; + double *eval = &eigenvalues[start_ptn*num_states + model_vec_id]; double trans_prob = 0.0; - for (i = 0; i < states_vsize; i+=vsize) { - double val = eval[i]; + for (int i = 0; i < states_vsize; i+=vsize) { + double val = eval[i]; double trans = evec[i] * inv_evec[i*num_states] * exp(time * val); trans_prob += trans; } return trans_prob; } -double ModelSet::computeTrans(double time, int model_id, int state1, int state2, double &derv1, double &derv2) { - if (phylo_tree->vector_size == 1) { - return at(model_id)->computeTrans(time, state1, state2, derv1, derv2); - } +double ModelSet::computeTrans(double time, int model_id, int state1, int state2, double &derv1, double &derv2) +{ + if (phylo_tree->vector_size == 1) { + return at(model_id)->computeTrans(time, state1, state2, derv1, derv2); + } // temporary fix problem with vectorized eigenvectors - int i; - int vsize = phylo_tree->vector_size; - int states_vsize = num_states*vsize; - int model_vec_id = model_id % vsize; - int start_ptn = model_id - model_vec_id; - double *evec = &eigenvectors[start_ptn*num_states*num_states + model_vec_id + state1*num_states*vsize]; - double *inv_evec = &inv_eigenvectors[start_ptn*num_states*num_states + model_vec_id + state2*vsize]; - double *eval = &eigenvalues[start_ptn*num_states + model_vec_id]; + int vsize = phylo_tree->vector_size; + int states_vsize = num_states*vsize; + int model_vec_id = model_id % vsize; + int start_ptn = model_id - model_vec_id; + double *evec = &eigenvectors[start_ptn*num_states*num_states + model_vec_id + state1*num_states*vsize]; + double *inv_evec = &inv_eigenvectors[start_ptn*num_states*num_states + model_vec_id + state2*vsize]; + double *eval = &eigenvalues[start_ptn*num_states + model_vec_id]; double trans_prob = 0.0; derv1 = derv2 = 0.0; - for (i = 0; i < states_vsize; i+=vsize) { - double val = eval[i]; + for (int i = 0; i < states_vsize; i+=vsize) { + double val = eval[i]; double trans = evec[i] * inv_evec[i*num_states] * exp(time * val); double trans2 = trans * val; trans_prob += trans; @@ -104,146 +378,111 @@ double ModelSet::computeTrans(double time, int model_id, int state1, int state2, return trans_prob; } -int ModelSet::getNDim() +uint64_t ModelSet::getMemoryRequired() { - ASSERT(size()); - return front()->getNDim(); -} - -void ModelSet::writeInfo(ostream& out) -{ - if (empty()) { - return; - } - if (verbose_mode >= VB_DEBUG) { - int i = 1; - for (iterator it = begin(); it != end(); it++, i++) { - out << "Partition " << i << ":" << endl; - (*it)->writeInfo(out); - } - } else { - front()->writeInfo(out); + uint64_t mem = ModelMarkov::getMemoryRequired(); + for (iterator it = begin(); it != end(); it++) { + mem += (*it)->getMemoryRequired(); } + return mem; } void ModelSet::decomposeRateMatrix() { - if (empty()) { - return; - } - for (iterator it = begin(); it != end(); it++) { - (*it)->decomposeRateMatrix(); - } - if (phylo_tree->vector_size == 1) + if (empty()) return; + // decompose for each submodel + for (iterator it = begin(); it != end(); it++) { + (*it)->decomposeRateMatrix(); + } + // set site-specific rates (if any) + if (isSSR()) { // multiply eigenvalues of each submodel by the submodel rate scaler + for (size_t ptn = 0; ptn < size(); ptn++) { + ASSERT(phylo_tree->aln->ptn_rate_scaler[ptn]); + double rate_scaler = phylo_tree->aln->ptn_rate_scaler[ptn]; + double *eval_ptr = &eigenvalues[ptn*num_states]; + for (size_t x = 0; x < num_states; x++) { + eval_ptr[x] *= rate_scaler; + } + } + } + if (phylo_tree->vector_size == 1) { return; - // rearrange eigen to obey vector_size + } + // else rearrange eigen to obey vector_size size_t vsize = phylo_tree->vector_size; size_t states2 = num_states*num_states; - - size_t max_size = get_safe_upper_limit(size()); - - // copy dummy values - for (size_t m = size(); m < max_size; m++) { - memcpy(&eigenvalues[m*num_states], &eigenvalues[(m-1)*num_states], sizeof(double)*num_states); - memcpy(&eigenvectors[m*states2], &eigenvectors[(m-1)*states2], sizeof(double)*states2); - memcpy(&inv_eigenvectors[m*states2], &inv_eigenvectors[(m-1)*states2], sizeof(double)*states2); - memcpy(&inv_eigenvectors_transposed[m*states2], &inv_eigenvectors_transposed[(m-1)*states2], sizeof(double)*states2); - } - - double new_eval[num_states*vsize]; - double new_evec[states2*vsize]; - double new_inv_evec[states2*vsize]; - - for (size_t ptn = 0; ptn < size(); ptn += vsize) { - double *eval_ptr = &eigenvalues[ptn*num_states]; - double *evec_ptr = &eigenvectors[ptn*states2]; - double *inv_evec_ptr = &inv_eigenvectors[ptn*states2]; - for (size_t i = 0; i < vsize; i++) { - for (size_t x = 0; x < num_states; x++) - new_eval[x*vsize+i] = eval_ptr[x]; - for (size_t x = 0; x < states2; x++) { - new_evec[x*vsize+i] = evec_ptr[x]; - new_inv_evec[x*vsize+i] = inv_evec_ptr[x]; - } - eval_ptr += num_states; - evec_ptr += states2; - inv_evec_ptr += states2; - } - // copy new values - memcpy(&eigenvalues[ptn*num_states], new_eval, sizeof(double)*num_states*vsize); - memcpy(&eigenvectors[ptn*states2], new_evec, sizeof(double)*states2*vsize); - memcpy(&inv_eigenvectors[ptn*states2], new_inv_evec, sizeof(double)*states2*vsize); - calculateSquareMatrixTranspose(new_inv_evec, num_states - , &inv_eigenvectors_transposed[ptn*states2]); - } -} - -bool ModelSet::getVariables(double* variables) -{ - ASSERT(size()); - bool changed = false; - for (iterator it = begin(); it != end(); it++) - changed |= (*it)->getVariables(variables); - return changed; -} - -void ModelSet::setVariables(double* variables) -{ - ASSERT(size()); - front()->setVariables(variables); + // copy dummy values + size_t max_size = get_safe_upper_limit(size()); + for (size_t m = size(); m < max_size; m++) { + memcpy(&eigenvalues[m*num_states], &eigenvalues[(m-1)*num_states], sizeof(double)*num_states); + memcpy(&eigenvectors[m*states2], &eigenvectors[(m-1)*states2], sizeof(double)*states2); + memcpy(&inv_eigenvectors[m*states2], &inv_eigenvectors[(m-1)*states2], sizeof(double)*states2); + memcpy(&inv_eigenvectors_transposed[m*states2], &inv_eigenvectors_transposed[(m-1)*states2], sizeof(double)*states2); + } + // make rearranged eigen for each submodel + double new_eval[num_states*vsize]; + double new_evec[states2*vsize]; + double new_inv_evec[states2*vsize]; + for (size_t ptn = 0; ptn < size(); ptn += vsize) { + double *eval_ptr = &eigenvalues[ptn*num_states]; + double *evec_ptr = &eigenvectors[ptn*states2]; + double *inv_evec_ptr = &inv_eigenvectors[ptn*states2]; + for (size_t i = 0; i < vsize; i++) { + for (size_t x = 0; x < num_states; x++) { + new_eval[x*vsize+i] = eval_ptr[x]; + } + for (size_t x = 0; x < states2; x++) { + new_evec[x*vsize+i] = evec_ptr[x]; + new_inv_evec[x*vsize+i] = inv_evec_ptr[x]; + } + eval_ptr += num_states; + evec_ptr += states2; + inv_evec_ptr += states2; + } + // copy new values + memcpy(&eigenvalues[ptn*num_states], new_eval, sizeof(double)*num_states*vsize); + memcpy(&eigenvectors[ptn*states2], new_evec, sizeof(double)*states2*vsize); + memcpy(&inv_eigenvectors[ptn*states2], new_inv_evec, sizeof(double)*states2*vsize); + calculateSquareMatrixTranspose(new_inv_evec, num_states, &inv_eigenvectors_transposed[ptn*states2]); + } } -ModelSet::~ModelSet() +void ModelSet::joinEigenMemory() { - for (reverse_iterator rit = rbegin(); rit != rend(); rit++) { - (*rit)->eigenvalues = nullptr; - (*rit)->eigenvectors = nullptr; - (*rit)->inv_eigenvectors = nullptr; - (*rit)->inv_eigenvectors_transposed = nullptr; - delete (*rit); - } -} - -void ModelSet::joinEigenMemory() { - size_t nmixtures = get_safe_upper_limit(size()); - aligned_free(eigenvalues); - aligned_free(eigenvectors); - aligned_free(inv_eigenvectors); - aligned_free(inv_eigenvectors_transposed); - - size_t states2 = num_states*num_states; - - eigenvalues = aligned_alloc(num_states*nmixtures); - eigenvectors = aligned_alloc(states2*nmixtures); - inv_eigenvectors = aligned_alloc(states2*nmixtures); - inv_eigenvectors_transposed = aligned_alloc(states2*nmixtures); - - // assigning memory for individual models - size_t m = 0; - for (iterator it = begin(); it != end(); it++, m++) { - // first copy memory for eigen stuffs - memcpy(&eigenvalues[m*num_states], (*it)->eigenvalues, num_states*sizeof(double)); - memcpy(&eigenvectors[m*states2], (*it)->eigenvectors, states2*sizeof(double)); - memcpy(&inv_eigenvectors[m*states2], (*it)->inv_eigenvectors, states2*sizeof(double)); - memcpy(&inv_eigenvectors_transposed[m*states2], (*it)->inv_eigenvectors_transposed, states2*sizeof(double)); - // then delete - aligned_free((*it)->eigenvalues); - aligned_free((*it)->eigenvectors); - aligned_free((*it)->inv_eigenvectors); - aligned_free((*it)->inv_eigenvectors_transposed); - - // and assign new memory - (*it)->eigenvalues = &eigenvalues[m*num_states]; - (*it)->eigenvectors = &eigenvectors[m*states2]; - (*it)->inv_eigenvectors = &inv_eigenvectors[m*states2]; - (*it)->inv_eigenvectors_transposed = &inv_eigenvectors_transposed[m*states2]; - } - - // copy dummy values - for (m = size(); m < nmixtures; m++) { - memcpy(&eigenvalues[m*num_states], &eigenvalues[(m-1)*num_states], sizeof(double)*num_states); - memcpy(&eigenvectors[m*states2], &eigenvectors[(m-1)*states2], sizeof(double)*states2); - memcpy(&inv_eigenvectors[m*states2], &inv_eigenvectors[(m-1)*states2], sizeof(double)*states2); - memcpy(&inv_eigenvectors_transposed[m*states2], &inv_eigenvectors_transposed[(m-1)*states2], sizeof(double)*states2); - } + size_t states2 = num_states*num_states; + size_t nmixtures = get_safe_upper_limit(size()); + aligned_free(eigenvalues); + aligned_free(eigenvectors); + aligned_free(inv_eigenvectors); + aligned_free(inv_eigenvectors_transposed); + eigenvalues = aligned_alloc(num_states*nmixtures); + eigenvectors = aligned_alloc(states2*nmixtures); + inv_eigenvectors = aligned_alloc(states2*nmixtures); + inv_eigenvectors_transposed = aligned_alloc(states2*nmixtures); + // assigning memory for individual models + size_t m = 0; + for (iterator it = begin(); it != end(); it++, m++) { + // first copy memory for eigen stuffs + memcpy(&eigenvalues[m*num_states], (*it)->eigenvalues, num_states*sizeof(double)); + memcpy(&eigenvectors[m*states2], (*it)->eigenvectors, states2*sizeof(double)); + memcpy(&inv_eigenvectors[m*states2], (*it)->inv_eigenvectors, states2*sizeof(double)); + memcpy(&inv_eigenvectors_transposed[m*states2], (*it)->inv_eigenvectors_transposed, states2*sizeof(double)); + // then delete + aligned_free((*it)->eigenvalues); + aligned_free((*it)->eigenvectors); + aligned_free((*it)->inv_eigenvectors); + aligned_free((*it)->inv_eigenvectors_transposed); + // and assign new memory + (*it)->eigenvalues = &eigenvalues[m*num_states]; + (*it)->eigenvectors = &eigenvectors[m*states2]; + (*it)->inv_eigenvectors = &inv_eigenvectors[m*states2]; + (*it)->inv_eigenvectors_transposed = &inv_eigenvectors_transposed[m*states2]; + } + // copy dummy values + for (size_t m = size(); m < nmixtures; m++) { + memcpy(&eigenvalues[m*num_states], &eigenvalues[(m-1)*num_states], sizeof(double)*num_states); + memcpy(&eigenvectors[m*states2], &eigenvectors[(m-1)*states2], sizeof(double)*states2); + memcpy(&inv_eigenvectors[m*states2], &inv_eigenvectors[(m-1)*states2], sizeof(double)*states2); + memcpy(&inv_eigenvectors_transposed[m*states2], &inv_eigenvectors_transposed[(m-1)*states2], sizeof(double)*states2); + } } diff --git a/model/modelset.h b/model/modelset.h index bc45d0408..e82d53eeb 100644 --- a/model/modelset.h +++ b/model/modelset.h @@ -23,47 +23,162 @@ #include "modelmarkov.h" /** - * a set of substitution models, used eg for site-specific state frequency model or - * partition model with joint branch lengths + * A set of substitution models to implement site-specific models */ class ModelSet : public ModelMarkov, public vector { public: - ModelSet(const char *model_name, PhyloTree *tree); + ModelSet(const string model_name, ModelsBlock *models_block, + StateFreqType freq, string freq_params, PhyloTree *tree); + + virtual ~ModelSet(); + + /** + set checkpoint object + @param checkpoint + */ + virtual void setCheckpoint(Checkpoint *checkpoint); + + /** + start structure for checkpointing + */ + virtual void startCheckpoint(); + + /** + save object into the checkpoint + */ + virtual void saveCheckpoint(); + + /** + restore object from the checkpoint + */ + virtual void restoreCheckpoint(); + /** * @return TRUE if this is a site-specific model, FALSE otherwise */ virtual bool isSiteSpecificModel() { return true; } + /** + * @return TRUE if the model has site-specific frequencies, FALSE otherwise + */ + virtual bool isSSF() { return phylo_tree->aln->isSSF(); } + + /** + * @return TRUE if the model has site-specific rates, FALSE otherwise + */ + virtual bool isSSR() { return phylo_tree->aln->isSSR(); } + /** * get the size of transition matrix, default is num_states*num_states. * can be changed for e.g. site-specific model */ virtual int getTransMatrixSize() { return num_states * num_states * size(); } + /** + * @return model name + */ + virtual string getName(); + + /** + * @return model name with parameters in form of e.g. GTR{a,b,c,d,e,f} + */ + virtual string getNameParams(bool show_fixed_params = false); + + /** + write information + @param out output stream + */ + virtual void writeInfo(ostream &out); + + /** + get the rate matrix. + @param rate_mat (OUT) upper-triagle rate matrix. + Assume rate_mat has size of num_states*(num_states-1)/2 + */ + virtual void getRateMatrix(double *rate_mat); + + /** + compute the state frequency vector. One should override this function when defining new model. + The default is equal state sequency, valid for all kind of data. + @param mixture (optional) class for mixture model + @param[out] state_freqs state frequency vector. Assume state_freqs has size of num_states + */ + virtual void getStateFrequency(double *state_freqs, int mixture = 0); + + /** + compute Q matrix + @param q_mat (OUT) Q matrix, assuming of size num_states * num_states + */ + virtual void getQMatrix(double *q_mat, int mixture = 0); + + /** + get frequency type + @return frequency type + */ + virtual StateFreqType getFreqType(); + + /** + @return the number of dimensions + */ + virtual int getNDim(); + + /** + @return the number of dimensions corresponding to state frequencies, which is + not counted in getNDim(). This serves e.g. for computing AIC, BIC score + */ + virtual int getNDimFreq(); + + /** + @return TRUE if parameters are at the boundary that may cause numerical unstability + */ + virtual bool isUnstableParameters(); + + /** + setup the bounds for joint optimization with BFGS + */ + virtual void setBounds(double *lower_bound, double *upper_bound, bool *bound_check); + + /** + rescale the state frequencies + @param sum_one TRUE to make frequencies sum to 1, FALSE to make last entry equal to 1 + */ + virtual void scaleStateFreq(bool sum_one); + + /** + optimize model parameters + @return the best likelihood + */ + virtual double optimizeParameters(double gradient_epsilon); + + /** + the target function which needs to be optimized + @param x the input vector x + @return the function value at x + */ + virtual double targetFunk(double x[]); /** compute the transition probability matrix. @param time time between two events - @param mixture (optional) class for mixture model - @param selected_row (optional) only compute the entries of one selected row. By default, compute all rows + @param mixture (optional) class for mixture model + @param selected_row (optional) only compute the entries of one selected row. By default, compute all rows @param trans_matrix (OUT) the transition matrix between all pairs of states. - Assume trans_matrix has size of num_states * num_states. + Assume trans_matrix has size of num_states * num_states. */ virtual void computeTransMatrix(double time, double *trans_matrix, int mixture = 0, int selected_row = -1); - /** compute the transition probability matrix.and the derivative 1 and 2 @param time time between two events - @param mixture (optional) class for mixture model + @param mixture (optional) class for mixture model @param trans_matrix (OUT) the transition matrix between all pairs of states. - Assume trans_matrix has size of num_states * num_states. - @param trans_derv1 (OUT) the 1st derivative matrix between all pairs of states. - @param trans_derv2 (OUT) the 2nd derivative matrix between all pairs of states. + @param trans_derv1 (OUT) the 1st derivative matrix between all pairs of states. + @param trans_derv2 (OUT) the 2nd derivative matrix between all pairs of states. + Assume trans_matrix has size of num_states * num_states. */ - virtual void computeTransDerv(double time, double *trans_matrix, + virtual void computeTransDerv(double time, double *trans_matrix, double *trans_derv1, double *trans_derv2, int mixture = 0); /** @@ -86,10 +201,8 @@ class ModelSet : public ModelMarkov, public vector */ virtual double computeTrans(double time, int state1, int state2, double &derv1, double &derv2) { return 0; } - - /** - compute the transition probability between two states at a specific site + compute the transition probability between two states at a specific site One should override this function when defining new model. The default is the Juke-Cantor model, valid for all kind of data (DNA, AA, Codon, etc) @param time time between two events @@ -117,65 +230,39 @@ class ModelSet : public ModelMarkov, public vector * @param ptn pattern ID of the alignment */ virtual int getPtnModelID(int ptn); - - /** - return the number of dimensions - */ - virtual int getNDim(); - /** - write information - @param out output stream + compute the memory size for the model, can be large for site-specific models + @return memory size required in bytes */ - virtual void writeInfo(ostream &out); + virtual uint64_t getMemoryRequired(); /** decompose the rate matrix into eigenvalues and eigenvectors */ virtual void decomposeRateMatrix(); - virtual ~ModelSet(); - - /** - * compute the memory size for the model, can be large for site-specific models - * @return memory size required in bytes - */ - virtual uint64_t getMemoryRequired() { - uint64_t mem = ModelMarkov::getMemoryRequired(); - for (iterator it = begin(); it != end(); it++) - mem += (*it)->getMemoryRequired(); - return mem; - } - - /** map from pattern ID to model ID */ - IntVector pattern_model_map; - - /** - join memory for eigen into one chunk - */ - void joinEigenMemory(); - protected: - - - /** - this function is served for the multi-dimension optimization. It should pack the model parameters + this function is served for the multi-dimension optimization. It should pack the model parameters into a vector that is index from 1 (NOTE: not from 0) @param variables (OUT) vector of variables, indexed from 1 */ virtual void setVariables(double *variables); /** - this function is served for the multi-dimension optimization. It should assign the model parameters + this function is served for the multi-dimension optimization. It should assign the model parameters from a vector of variables that is index from 1 (NOTE: not from 0) @param variables vector of variables, indexed from 1 @return TRUE if parameters are changed, FALSE otherwise (2015-10-20) */ virtual bool getVariables(double *variables); - + /** + join memory for eigen into one chunk + */ + void joinEigenMemory(); + }; #endif // MODELSET_H diff --git a/model/modelsubst.h b/model/modelsubst.h index 37835a8f2..d44f2c8d2 100644 --- a/model/modelsubst.h +++ b/model/modelsubst.h @@ -90,6 +90,16 @@ class ModelSubst: public Optimization, public CheckpointFactory */ virtual bool isSiteSpecificModel() { return false; } + /** + * @return TRUE if the model has site-specific frequencies, FALSE otherwise + */ + virtual bool isSSF() { return false; } + + /** + * @return TRUE if the model has site-specific rates, FALSE otherwise + */ + virtual bool isSSR() { return false; } + /** * @return TRUE if this is a mixture model, FALSE otherwise */ @@ -282,6 +292,11 @@ class ModelSubst: public Optimization, public CheckpointFactory */ virtual StateFreqType getFreqType() { return FREQ_EQUAL; } + /** + set frequency type + */ + virtual void setFreqType(StateFreqType freq) { freq_type = freq; } + /** set the associated tree @param tree the associated tree diff --git a/model/partitionmodelplen.cpp b/model/partitionmodelplen.cpp index 967efca07..d13ad6b03 100644 --- a/model/partitionmodelplen.cpp +++ b/model/partitionmodelplen.cpp @@ -188,8 +188,8 @@ double PartitionModelPlen::optimizeParameters(int fixed_len, bool write_info, do it->second->writeInfo(cout); } - cout << "Parameters optimization took " << i-1 << " rounds (" << getRealTime()-begin_time << " sec)" << endl << endl; - + cout << "Parameters optimization took " << i-1 << " rounds (" << getRealTime()-begin_time << " sec)" << endl; + return tree_lh; } diff --git a/model/rategamma.cpp b/model/rategamma.cpp index 8b339abc4..3646811a1 100644 --- a/model/rategamma.cpp +++ b/model/rategamma.cpp @@ -255,6 +255,7 @@ void RateGamma::writeParameters(ostream &out) { out << "\t" << gamma_shape; } +/* int RateGamma::computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat) { //cout << "Computing Gamma site rates by empirical Bayes..." << endl; @@ -289,7 +290,7 @@ int RateGamma::computePatternRates(DoubleVector &pattern_rates, IntVector &patte // pattern_cat[i] = j; // delete [] ptn_rates; } - +*/ /*NUMERICAL SUBROUTINES ************************************************************************************** diff --git a/model/rategamma.h b/model/rategamma.h index 1f6e90315..1337cf734 100644 --- a/model/rategamma.h +++ b/model/rategamma.h @@ -139,7 +139,7 @@ class RateGamma: virtual public RateHeterogeneity @param pattern_rates (OUT) pattern rates. Resizing if necesary @return total number of categories */ - virtual int computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat); + //virtual int computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat); /** * setup the bounds for joint optimization with BFGS diff --git a/model/rategammainvar.cpp b/model/rategammainvar.cpp index 134349dbc..4542e075f 100644 --- a/model/rategammainvar.cpp +++ b/model/rategammainvar.cpp @@ -186,7 +186,7 @@ double RateGammaInvar::optimizeParameters(double gradient_epsilon) { } } - +/* int RateGammaInvar::computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat) { //cout << "Computing Gamma site rates by empirical Bayes..." << endl; @@ -215,6 +215,7 @@ int RateGammaInvar::computePatternRates(DoubleVector &pattern_rates, IntVector & } return ncategory+1; } +*/ double RateGammaInvar::optimizeWithEM(double gradient_epsilon) { double curlh = phylo_tree->computeLikelihood(); diff --git a/model/rategammainvar.h b/model/rategammainvar.h index 67ad17bf6..762080cda 100644 --- a/model/rategammainvar.h +++ b/model/rategammainvar.h @@ -157,7 +157,7 @@ class RateGammaInvar : public RateInvar, public RateGamma @param pattern_rates (OUT) pattern rates. Resizing if necesary @return total number of categories */ - virtual int computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat); + //virtual int computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat); protected: diff --git a/model/rateheterogeneity.cpp b/model/rateheterogeneity.cpp index d162eaf1c..34ad86975 100644 --- a/model/rateheterogeneity.cpp +++ b/model/rateheterogeneity.cpp @@ -22,12 +22,17 @@ #include "rateheterogeneity.h" -RateHeterogeneity::RateHeterogeneity() - : Optimization(), CheckpointFactory() +RateHeterogeneity::RateHeterogeneity(PhyloTree *tree) + : Optimization(), CheckpointFactory() { name = ""; full_name = "Uniform"; - phylo_tree = NULL; + phylo_tree = tree; + // update name if site-specific rates are used + if (phylo_tree && phylo_tree->aln->isSSR()) { + name = "+SSR"; + full_name = "(site-specific rates)"; + } } void RateHeterogeneity::setTree(PhyloTree *tree) { diff --git a/model/rateheterogeneity.h b/model/rateheterogeneity.h index d518cade4..4be23f67b 100644 --- a/model/rateheterogeneity.h +++ b/model/rateheterogeneity.h @@ -45,13 +45,13 @@ class RateHeterogeneity : public Optimization, public CheckpointFactory { friend class ModelFactory; friend class ModelPoMoMixture; - friend class IQTreeMix; + friend class IQTreeMix; public: /** constructor */ - RateHeterogeneity(); + RateHeterogeneity(PhyloTree *tree = NULL); /** destructor @@ -85,6 +85,11 @@ class RateHeterogeneity : public Optimization, public CheckpointFactory */ PhyloTree *getTree() { return phylo_tree; } + /** + @return TRUE if this is a rate mixture model + */ + bool isMixture() { return !isHeterotachy() && getNRate() > 1; } + /** * @return model name with parameters in form of e.g. GTR{a,b,c,d,e,f} */ @@ -304,7 +309,15 @@ class RateHeterogeneity : public Optimization, public CheckpointFactory @param pattern_rates (OUT) pattern rates. Resizing if necesary @return total number of categories */ - virtual int computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat) { return 1; } + //virtual int computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat) { return 1; } + + /* + Get site-specific rates precomputed by the model of Meyer & von Haeseler (2003) + @param ptn_rate (OUT) rates per pattern + @param ptn_category (OUT) rate categories per pattern if the model is discrete + @return number of categories if the model is discrete + */ + virtual int getPatternRates(DoubleVector &ptn_rate, IntVector &ptn_category) { return 1; } /** name of the rate heterogeneity type diff --git a/model/ratekategory.cpp b/model/ratekategory.cpp index 60cdc8363..ab15e751b 100644 --- a/model/ratekategory.cpp +++ b/model/ratekategory.cpp @@ -96,6 +96,7 @@ double RateKategory::optimizeParameters(double gradient_epsilon) return score; } +/* int RateKategory::computePatternRates(DoubleVector& pattern_rates, IntVector& pattern_cat) { cout << "Computing site rates by empirical Bayes..." << endl; @@ -136,6 +137,7 @@ int RateKategory::computePatternRates(DoubleVector& pattern_rates, IntVector& pa // pattern_cat[i] = j; // delete [] ptn_rates; } +*/ bool RateKategory::getVariables(double* variables) { diff --git a/model/ratekategory.h b/model/ratekategory.h index 4289dd345..c56032fdc 100644 --- a/model/ratekategory.h +++ b/model/ratekategory.h @@ -67,7 +67,7 @@ class RateKategory : virtual public RateHeterogeneity @param pattern_rates (OUT) pattern rates. Resizing if necesary @return total number of categories */ - virtual int computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat); + //virtual int computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat); /** the target function which needs to be optimized diff --git a/model/ratemeyerdiscrete.cpp b/model/ratemeyerdiscrete.cpp index 72a8ab83c..d6788502e 100644 --- a/model/ratemeyerdiscrete.cpp +++ b/model/ratemeyerdiscrete.cpp @@ -240,11 +240,25 @@ double RateMeyerDiscrete::getPtnRate(int ptn) { return rates[ptn_cat[ptn]]; } +/* int RateMeyerDiscrete::computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat) { pattern_rates.insert(pattern_rates.begin(), begin(), end()); pattern_cat.insert(pattern_cat.begin(), ptn_cat, ptn_cat + size()); return ncategory; } +*/ + +int RateMeyerDiscrete::getPatternRates(DoubleVector &ptn_rate, IntVector &ptn_category) { + if (!is_categorized) return RateMeyerHaeseler::getPatternRates(ptn_rate, ptn_category); + ASSERT(ptn_cat && rates); + ptn_rate.resize(size(), 1.0); + ptn_category.resize(size(), -1); + for (size_t ptn = 0; ptn < size(); ++ptn) { + ptn_rate[ptn] = rates[ptn_cat[ptn]]; + ptn_category[ptn] = ptn_cat[ptn]; + } + return ncategory; +} /*double RateMeyerDiscrete::optimizeParameters() { if (is_categorized) { diff --git a/model/ratemeyerdiscrete.h b/model/ratemeyerdiscrete.h index e5b8a4fb1..d21bf72e3 100644 --- a/model/ratemeyerdiscrete.h +++ b/model/ratemeyerdiscrete.h @@ -77,7 +77,15 @@ class RateMeyerDiscrete : public RateMeyerHaeseler @param pattern_rates (OUT) pattern rates. Resizing if necesary @return total number of categories */ - virtual int computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat); + //virtual int computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat); + + /* + Get site-specific rates precomputed by the model of Meyer & von Haeseler (2003) + @param ptn_rate (OUT) rates per pattern + @param ptn_category (OUT) rate categories per pattern if the model is discrete + @return number of categories if the model is discrete + */ + virtual int getPatternRates(DoubleVector &ptn_rate, IntVector &ptn_category); virtual bool isSiteSpecificRate(); diff --git a/model/ratemeyerhaeseler.cpp b/model/ratemeyerhaeseler.cpp index d8f52bb12..ea6d17e20 100644 --- a/model/ratemeyerhaeseler.cpp +++ b/model/ratemeyerhaeseler.cpp @@ -136,10 +136,17 @@ double RateMeyerHaeseler::getPtnRate(int ptn) { return 1.0; } +/* int RateMeyerHaeseler::computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat) { pattern_rates.insert(pattern_rates.begin(), begin(), end()); return size(); } +*/ + +int RateMeyerHaeseler::getPatternRates(DoubleVector &ptn_rate, IntVector &ptn_category) { + ptn_rate.insert(ptn_rate.begin(), begin(), end()); + return 1; +} void RateMeyerHaeseler::getRates(DoubleVector &rates) { rates.clear(); @@ -484,33 +491,31 @@ void RateMeyerHaeseler::runIterativeProc(Params ¶ms, IQTree &tree) { } setTree(&tree); RateHeterogeneity *backup_rate = tree.getRate(); - if (backup_rate->getGammaShape() > 0 ) { - IntVector pattern_cat; - backup_rate->computePatternRates(*this, pattern_cat); - double sum = 0.0; - size_t seqLen = size(); - auto freq = phylo_tree->getConvertedSequenceFrequencies(); - if (freq!=nullptr && seqLen == phylo_tree->getConvertedSequenceLength()) { - #ifdef _OPENMP - #pragma omp parallel for reduction(+:sum) - #endif - for (size_t i = 0; i < seqLen ; ++i) { - sum += at(i) * freq[i]; - } - } else { - for (size_t i = 0; i < seqLen; ++i) { - sum += at(i) * phylo_tree->aln->at(i).frequency; - } - } - sum /= phylo_tree->aln->getNSite(); + if (backup_rate->isMixture()) { + tree.computePatternRate(*this); + double sum = 0.0; + size_t seqLen = size(); + auto freq = phylo_tree->getConvertedSequenceFrequencies(); + if (freq!=nullptr && seqLen == phylo_tree->getConvertedSequenceLength()) { + #ifdef _OPENMP + #pragma omp parallel for reduction(+:sum) + #endif + for (size_t i = 0; i < seqLen ; ++i) { + sum += at(i) * freq[i]; + } + } else { + for (size_t i = 0; i < seqLen; ++i) { + sum += at(i) * phylo_tree->aln->at(i).frequency; + } + } + sum /= phylo_tree->aln->getNSite(); if (fabs(sum - 1.0) > 0.0001) { - if (verbose_mode >= VB_MED) { - cout << "Normalizing Gamma rates (" << sum << ")" << endl; - } - for (size_t i = 0; i < size(); ++i) { - at(i) /= sum; - } - } + if (verbose_mode >= VB_MED) + cout << "Normalizing site rates (mean: " << sum << ")" << endl; + for (size_t i = 0; i < size(); ++i) { + at(i) /= sum; + } + } } tree.getModelFactory()->site_rate = this; tree.setRate(this); diff --git a/model/ratemeyerhaeseler.h b/model/ratemeyerhaeseler.h index 10cf2b03b..35632b7c9 100644 --- a/model/ratemeyerhaeseler.h +++ b/model/ratemeyerhaeseler.h @@ -84,7 +84,15 @@ class RateMeyerHaeseler : public RateHeterogeneity, public DoubleVector @param pattern_rates (OUT) pattern rates. Resizing if necesary @return total number of categories */ - virtual int computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat); + //virtual int computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat); + + /* + Get site-specific rates precomputed by the model of Meyer & von Haeseler (2003) + @param ptn_rate (OUT) rates per pattern + @param ptn_category (OUT) rate categories per pattern if the model is discrete + @return number of categories if the model is discrete + */ + virtual int getPatternRates(DoubleVector &ptn_rate, IntVector &ptn_category); void getRates(DoubleVector &rates); diff --git a/simulator/alisimulatorheterogeneity.cpp b/simulator/alisimulatorheterogeneity.cpp index 7bd3022cb..4f6d368a9 100644 --- a/simulator/alisimulatorheterogeneity.cpp +++ b/simulator/alisimulatorheterogeneity.cpp @@ -193,26 +193,25 @@ vector AliSimulatorHeterogeneity::regenerateSequenceMixtureModel(int } /** - extract pattern- posterior mean state frequencies and posterior model probability + extract pattern-specific posterior mean state frequencies and posterior model probability */ void AliSimulatorHeterogeneity::extractPatternPosteriorFreqsAndModelProb() { - // get pattern-specific state frequencies (ptn_state_freq) int nptn = tree->aln->getNPattern(); int nmixture = tree->getModel()->getNMixtures(); if (!ptn_state_freq) { - ptn_state_freq = new double[nptn * max_num_states]; - - SiteFreqType tmp_site_freq_type = tree->params->print_site_state_freq; - tree->params->print_site_state_freq = WSF_POSTERIOR_MEAN; + // get pattern-specific state frequencies (ptn_state_freq) + SiteFreqType tmp_site_freq_type = tree->params->site_state_freq_type; + tree->params->site_state_freq_type = WSF_POSTERIOR_MEAN; + ptn_state_freq = nullptr; tree->computePatternStateFreq(ptn_state_freq); + ASSERT(ptn_state_freq); + tree->params->site_state_freq_type = tmp_site_freq_type; // get pattern-specific posterior model probability int nptn_times_nmixture = nptn * nmixture; ptn_model_dis = new double[nptn_times_nmixture]; memcpy(ptn_model_dis, tree->getPatternLhCatPointer(), nptn_times_nmixture * sizeof(double)); - tree->params->print_site_state_freq = tmp_site_freq_type; - // convert ptn_model_dis to accummulated matrix convertProMatrixIntoAccumulatedProMatrix(ptn_model_dis, nptn, nmixture); } @@ -458,12 +457,14 @@ void AliSimulatorHeterogeneity::getSiteSpecificPosteriorRateHeterogeneity(vector { int num_rates = rate_heterogeneity->getNDiscreteRate(); - // get pattern-specific rate (ptn_rate) if (pattern_rates.size() == 0) { - IntVector pattern_cat; - tree->getRate()->computePatternRates(pattern_rates, pattern_cat); - + // get pattern-specific rate (ptn_rate) + SiteRateType tmp_site_rate_type = tree->params->site_rate_type; + tree->params->site_rate_type = WSR_POSTERIOR_MEAN; + tree->computePatternRate(pattern_rates); + ASSERT(pattern_rates.size()); + tree->params->site_rate_type = tmp_site_rate_type; // extract pattern rate distribution if the user wants to sample a rate for each site from posterior distribution if (tree->params->alisim_rate_heterogeneity == POSTERIOR_DIS) { diff --git a/tree/iqtree.cpp b/tree/iqtree.cpp index 7757a146a..545f8ded1 100644 --- a/tree/iqtree.cpp +++ b/tree/iqtree.cpp @@ -545,6 +545,7 @@ void IQTree::createPLLPartition(Params ¶ms, ostream &pllPartitionFileHandle) } void IQTree::computeInitialTree(LikelihoodKernel kernel, istream* in) { + cout << endl; double start = getRealTime(); string initTree; string out_file = params->out_prefix; @@ -561,7 +562,6 @@ void IQTree::computeInitialTree(LikelihoodKernel kernel, istream* in) { setParsimonyKernel(kernel); if (in != NULL || params->user_file) { - // start the search with user-defined tree bool myrooted = params->is_rooted; if (in != NULL) { @@ -589,7 +589,7 @@ void IQTree::computeInitialTree(LikelihoodKernel kernel, istream* in) { saveCheckpoint(); } else if (CKP_RESTORE(initTree)) { readTreeString(initTree); - cout << endl << "CHECKPOINT: Initial tree restored" << endl; + cout << "CHECKPOINT: Initial tree restored" << endl; } else { START_TREE_TYPE start_tree = params->start_tree; // only own parsimony kernel supports constraint tree @@ -611,7 +611,6 @@ void IQTree::computeInitialTree(LikelihoodKernel kernel, istream* in) { break; case STT_RANDOM_TREE: case STT_PLL_PARSIMONY: - cout << endl; cout << "Create initial parsimony tree by phylogenetic likelihood library (PLL)... "; pllInst->randomNumberSeed = params->ran_seed + MPIHelper::getInstance().getProcessID(); pllComputeRandomizedStepwiseAdditionParsimonyTree(pllInst, pllPartitions, params->sprDist); @@ -803,7 +802,7 @@ void IQTree::initCandidateTreeSet(int nParTrees, int nNNITrees) { candidateTrees.clear(); if (init_size < initTreeStrings.size()) - cout << "Computing log-likelihood of " << initTreeStrings.size() - init_size << " initial trees ... "; + cout << "Computing log-likelihood of " << initTreeStrings.size() - init_size << " initial trees... "; startTime = getRealTime(); for (vector::iterator it = initTreeStrings.begin(); it != initTreeStrings.end(); ++it) { @@ -2679,20 +2678,22 @@ void IQTree::refineBootTrees() { if (CKP_RESTORE(refined_samples)) cout << "CHECKPOINT: " << refined_samples << " refined samples restored" << endl; checkpoint->endStruct(); - + // 2018-08-17: delete duplicated memory deleteAllPartialLh(); ModelsBlock *models_block = readModelsDefinition(*params); - - // do bootstrap analysis - for (int sample = refined_samples; sample < boot_trees.size(); sample++) { + + // do bootstrap analysis + for (int sample = refined_samples; sample < boot_trees.size(); sample++) { + // create bootstrap alignment Alignment* bootstrap_alignment; - if (aln->isSuperAlignment()) + if (aln->isSuperAlignment()) { bootstrap_alignment = new SuperAlignment; - else + } else { bootstrap_alignment = new Alignment; + } bootstrap_alignment->createBootstrapAlignment(aln, NULL, params->bootstrap_spec); // create bootstrap tree @@ -2706,55 +2707,62 @@ void IQTree::refineBootTrees() { } else { // allocate heterotachy tree if neccessary int pos = posRateHeterotachy(aln->model_name); - if (params->num_mixlen > 1) { boot_tree = new PhyloTreeMixlen(bootstrap_alignment, params->num_mixlen); } else if (pos != string::npos) { boot_tree = new PhyloTreeMixlen(bootstrap_alignment, 0); - } else + } else { boot_tree = new IQTree(bootstrap_alignment); + } } boot_tree->on_refine_btree = true; boot_tree->save_all_trees = 0; - // initialize constraint tree - if (!constraintTree.empty()) { - boot_tree->constraintTree.readConstraint(constraintTree); - } - // set likelihood kernel boot_tree->setParams(params); boot_tree->setLikelihoodKernel(sse); boot_tree->setNumThreads(num_threads); // 2019-06-03: bug fix setting part_info properly - if (boot_tree->isSuperTree()) + if (boot_tree->isSuperTree()) { ((PhyloSuperTree*)boot_tree)->setPartInfo((PhyloSuperTree*)this); + } + + // initialize constraint tree + if (!constraintTree.empty()) { + boot_tree->constraintTree.readConstraint(constraintTree); + } + + // load the current ufboot tree + // 2019-02-06: fix crash with -sp and -bnni + // 2025-03-20: fix crash with -rt and -bnni (tree should be read before model) + if (isSuperTree()) { + boot_tree->PhyloTree::readTreeString(boot_trees[sample]); + } else { + boot_tree->readTreeString(boot_trees[sample]); + } // copy model // BQM 2019-05-31: bug fix with -bsam option + VerboseMode saved_mode = verbose_mode; + if (verbose_mode < VB_MAX) verbose_mode = VB_QUIET; boot_tree->initializeModel(*params, aln->model_name, models_block); + verbose_mode = saved_mode; boot_tree->getModelFactory()->setCheckpoint(getCheckpoint()); - if (isSuperTree()) + if (isSuperTree()) { ((PartitionModel*)boot_tree->getModelFactory())->PartitionModel::restoreCheckpoint(); - else + } else { boot_tree->getModelFactory()->restoreCheckpoint(); + } - // load the current ufboot tree - // 2019-02-06: fix crash with -sp and -bnni - if (isSuperTree()) - boot_tree->PhyloTree::readTreeString(boot_trees[sample]); - else - boot_tree->readTreeString(boot_trees[sample]); - + // re-initialize branch lengths for unlinked model if (boot_tree->isSuperTree() && params->partition_type == BRLEN_OPTIMIZE) { if (((PhyloSuperTree*)boot_tree)->size() > 1) { - // re-initialize branch lengths for unlinked model boot_tree->wrapperFixNegativeBranch(true); } } - + // TODO: check if this resolves the crash in reorientPartialLh() boot_tree->initializeAllPartialLh(); @@ -2806,8 +2814,8 @@ void IQTree::refineBootTrees() { checkpoint->dump(); - } - + } + delete models_block; cout << "Total " << refined_trees << " ufboot trees refined" << endl; diff --git a/tree/phylosupertree.cpp b/tree/phylosupertree.cpp index dcde7a0ed..7de4b32be 100644 --- a/tree/phylosupertree.cpp +++ b/tree/phylosupertree.cpp @@ -1458,16 +1458,23 @@ void PhyloSuperTree::endMarginalAncestralState(bool orig_kernel_nonrev, } void PhyloSuperTree::writeSiteLh(ostream &out, SiteLoglType wsl, int partid) { - int part = 1; - for (auto it = begin(); it != end(); ++it, ++part) - (*it)->writeSiteLh(out, wsl, part); + ASSERT(partid == -1); partid = 1; + for (iterator it = begin(); it != end(); ++it, ++partid) { + (*it)->writeSiteLh(out, wsl, partid); + } } -void PhyloSuperTree::writeSiteRates(ostream &out, bool bayes, int partid) { +void PhyloSuperTree::writeSiteFreqs(ostream &out, int partid) { + ASSERT(partid == -1); partid = 1; + for (iterator it = begin(); it != end(); ++it, ++partid) { + (*it)->writeSiteFreqs(out, partid); + } +} - int part = 1; - for (iterator it = begin(); it != end(); ++it, ++part) { - (*it)->writeSiteRates(out, bayes, part); +void PhyloSuperTree::writeSiteRates(ostream &out, bool bayes, int partid) { + ASSERT(partid == -1); partid = 1; + for (iterator it = begin(); it != end(); ++it, ++partid) { + (*it)->writeSiteRates(out, bayes, partid); } } diff --git a/tree/phylosupertree.h b/tree/phylosupertree.h index 60dbdcb9a..04cb2f7ea 100644 --- a/tree/phylosupertree.h +++ b/tree/phylosupertree.h @@ -439,17 +439,27 @@ class PhyloSuperTree : public IQTree, public vector end computing ancestral sequence probability for an internal node by marginal reconstruction */ virtual void endMarginalAncestralState(bool orig_kernel_nonrev, double* &ptn_ancestral_prob, int* &ptn_ancestral_seq); - + + /** + write site state frequencies to a file in the following format: + 1 freq(A)_1 freq(R)_1 ... + 2 freq(A)_2 freq(R)_2 ... + ... + This function should be used by -wsf option + @param out output stream to write freqs + */ + virtual void writeSiteFreqs(ostream &out, int partid = -1); + /** - write site-rates to a file in the following format: - 1 rate_1 - 2 rate_2 - .... - This function will call computePatternRates() - @param out output stream to write rates - @param bayes TRUE to use empirical Bayesian, false for ML method - */ - virtual void writeSiteRates(ostream &out, bool bayes, int partid = -1); + write site rates to a file in the following format: + 1 rate_1 + 2 rate_2 + ... + This function should be used by -wsr option + @param out output stream to write rates + @param bayes TRUE to use empirical Bayesian, false for ML method + */ + virtual void writeSiteRates(ostream &out, bool bayes, int partid = -1); /** write site log likelihood to a output stream diff --git a/tree/phylotree.cpp b/tree/phylotree.cpp index b6bfcb595..e20ccff21 100644 --- a/tree/phylotree.cpp +++ b/tree/phylotree.cpp @@ -1395,66 +1395,105 @@ double PhyloTree::computePatternLhCat(SiteLoglType wsl) { return score; } -void PhyloTree::computePatternStateFreq(double *ptn_state_freq) { - ASSERT(getModel()->isMixture()); - computePatternLhCat(WSL_MIXTURE); - double *lh_cat = _pattern_lh_cat; - size_t nptn = getAlnNPattern(); - size_t nmixture = getModel()->getNMixtures(); - double *ptn_freq = ptn_state_freq; - size_t nstates = aln->num_states; -// ModelMixture *models = (ModelMixture*)model; - - if (params->print_site_state_freq == WSF_POSTERIOR_MEAN) { - cout << "Computing posterior mean site frequencies...." << endl; - // loop over all site-patterns - for (size_t ptn = 0; ptn < nptn; ++ptn) { - - // first compute posterior for each mixture component - double sum_lh = 0.0; - for (size_t m = 0; m < nmixture; ++m) { - sum_lh += lh_cat[m]; - } - sum_lh = 1.0/sum_lh; - for (size_t m = 0; m < nmixture; ++m) { - lh_cat[m] *= sum_lh; - } - - // now compute state frequencies - for (size_t state = 0; state < nstates; ++state) { - double freq = 0; - for (size_t m = 0; m < nmixture; ++m) - freq += model->getMixtureClass(m)->state_freq[state] * lh_cat[m]; - ptn_freq[state] = freq; - } - - // increase the pointers - lh_cat += nmixture; - ptn_freq += nstates; - } - } else if (params->print_site_state_freq == WSF_POSTERIOR_MAX) { - cout << "Computing posterior max site frequencies...." << endl; - // loop over all site-patterns - for (size_t ptn = 0; ptn < nptn; ++ptn) { - // first compute posterior for each mixture component - size_t max_comp = 0; - for (size_t m = 1; m < nmixture; ++m) - if (lh_cat[m] > lh_cat[max_comp]) { - max_comp = m; - } - - // now compute state frequencies - memcpy(ptn_freq, model->getMixtureClass(max_comp)->state_freq, nstates*sizeof(double)); - - // increase the pointers - lh_cat += nmixture; - ptn_freq += nstates; - } - } +void PhyloTree::computePatternStateFreq(double* &all_ptn_state_freq, IntVector *ptn_cat, DoubleVector *ptn_pp, int *ncat) { + if (!getModel()->isMixture()) return; + string type; + if (params->site_state_freq_type == WSF_POSTERIOR_MEAN) type = "mean"; + else if (params->site_state_freq_type == WSF_POSTERIOR_MAX) type = "max"; + else ASSERT(false); + // prepare variables + size_t nmixture = getModel()->getNMixtures(); + if (ncat) *ncat = nmixture; + if (ptn_cat) ptn_cat->clear(); + if (ptn_pp) ptn_pp->clear(); + size_t nptn = getAlnNPattern(); + size_t nstates = aln->num_states; + ASSERT(!all_ptn_state_freq); + all_ptn_state_freq = new double[nptn * nstates]; + // start computation + cout << "Computing posterior " << type << " site frequencies..." << endl; + computePatternLhCat(WSL_MIXTURE); + double *lh_cat = _pattern_lh_cat; + double *state_freqs = all_ptn_state_freq; + // loop over all alignment patterns + for (size_t ptn = 0; ptn < nptn; ++ptn) { + // find max weight category and compute posterior normalization sum + size_t max_cat = 0; + double max_lh = 0.0; + double sum_lh = 0.0; + for (size_t m = 0; m < nmixture; ++m) { + if (lh_cat[m] > max_lh || + (lh_cat[m] == max_lh && random_double() < 0.5)) { // break the tie randomly + max_cat = m; + max_lh = lh_cat[m]; + } + sum_lh += lh_cat[m]; + } + // fill ptn_cat, ptn_pp and state_freqs + if (ptn_cat) ptn_cat->push_back(max_cat); + if (ptn_pp) ptn_pp->push_back(max_lh / sum_lh); + if (type == "mean") + for (size_t x = 0; x < nstates; ++x) { + double freq = 0.0; + for (size_t m = 0; m < nmixture; ++m) + freq += getModel()->getMixtureClass(m)->state_freq[x] * lh_cat[m]; + state_freqs[x] = freq / sum_lh; + } + else + memcpy(state_freqs, getModel()->getMixtureClass(max_cat)->state_freq, nstates*sizeof(double)); + // increase the pointers + lh_cat += nmixture; + state_freqs += nstates; + } +} + +void PhyloTree::computePatternRate(DoubleVector &ptn_rate, IntVector *ptn_cat, DoubleVector *ptn_pp, int *ncat) { + if (!getRate()->isMixture()) return; + string type; + if (params->site_rate_type == WSR_POSTERIOR_MEAN) type = "mean"; + else if (params->site_rate_type == WSR_POSTERIOR_MAX) type = "max"; + else ASSERT(false); + // prepare variables + size_t ncategory = getRate()->getNRate(); + size_t nextra = (getRate()->getPInvar()) ? 1 : 0; + if (ncat) *ncat = ncategory + nextra; + if (ptn_cat) ptn_cat->clear(); + if (ptn_pp) ptn_pp->clear(); + size_t nptn = getAlnNPattern(); + ASSERT(ptn_rate.empty()); + ptn_rate.resize(nptn); + // start computation + cout << "Computing posterior " << type << " site rates..." << endl; + computePatternLhCat(WSL_RATECAT); + double *lh_cat = _pattern_lh_cat; + // loop over all alignment patterns + for (size_t ptn = 0; ptn < nptn; ++ptn) { + // find max weight category and compute posterior normalization sum + size_t max_cat = 0; // if pinvar: 0=invar, 1=slow, etc.; else: 0=slow, etc. + double max_rate = 0.0, max_lh = ptn_invar[ptn]; + double mean_rate = 0.0, sum_lh = ptn_invar[ptn]; + for (size_t c = 0; c < ncategory; ++c) { + if (lh_cat[c] > max_lh || + (lh_cat[c] == max_lh && random_double() < 0.5)) { // break the tie randomly + max_cat = c + nextra; + max_rate = getRate()->getRate(c); + max_lh = lh_cat[c]; + } + mean_rate += getRate()->getRate(c) * lh_cat[c]; + sum_lh += lh_cat[c]; + } + // fill ptn_cat, ptn_pp and ptn_rate + if (ptn_cat) ptn_cat->push_back(max_cat); + if (ptn_pp) ptn_pp->push_back(max_lh / sum_lh); + if (type == "mean") + ptn_rate[ptn] = mean_rate / sum_lh; + else + ptn_rate[ptn] = max_rate; + // increase the pointers + lh_cat += ncategory; + } } - - void PhyloTree::computePatternLikelihood(double *ptn_lh, double *cur_logl, double *ptn_lh_cat, SiteLoglType wsl) { /* if (!dad_branch) { dad_branch = (PhyloNeighbor*) root->neighbors[0]; @@ -2536,6 +2575,8 @@ void PhyloTree::computeFuncDerv(double value, double &df, double &ddf) { } void PhyloTree::optimizePatternRates(DoubleVector &pattern_rates) { + ASSERT(pattern_rates.empty()); + cout << "Computing ML site rates..." << endl; size_t nptn = aln->getNPattern(); pattern_rates.resize(nptn, 1.0); #pragma omp parallel for @@ -6064,55 +6105,106 @@ void PhyloTree::writeSiteLh(ostream &out, SiteLoglType wsl, int partid) { aligned_free(pattern_lh); } +void PhyloTree::writeSiteFreqs(ostream &out, int partid) { + double *ptn_state_freqs = nullptr; + IntVector ptn_cat; + DoubleVector ptn_pp; + int ncat = 1; + // try to compute ptn_state_freqs + computePatternStateFreq(ptn_state_freqs, &ptn_cat, &ptn_pp, &ncat); + if (!ptn_state_freqs) { + if (partid >= 0) + cout << "Part " << partid << ": "; + cout << "No frequency mixture model specified" << endl; + return; + } + // write into outstream + out.setf(ios::fixed, ios::floatfield); + out.precision(5); + IntVector cat_cnt; + cat_cnt.resize(ncat, 0); + size_t nsites = getAlnNSite(); + size_t nstates = aln->num_states; + for (size_t site = 0; site < nsites; ++site) { + int ptn = aln->getPatternID(site); + if (partid >= 0) + out << partid << "\t"; + out << site + 1; + double *state_freqs = &ptn_state_freqs[ptn*nstates]; + for (size_t x = 0; x < nstates; ++x) { + out << "\t" << state_freqs[x]; + } + if (!ptn_cat.empty()) { + string site_cat; + site_cat = getModel()->getMixtureClass(ptn_cat[ptn])->name; + out << "\t" << site_cat; + if (!ptn_pp.empty()) + out << "\t" << ptn_pp[ptn]; + cat_cnt[ptn_cat[ptn]] ++; + } + out << endl; + } + delete [] ptn_state_freqs; + // print info to logs + if (partid >= 0) + cout << "Part " << partid << ": "; + cout << "Empirical proportions for each class:"; + for (size_t m = 0; m < cat_cnt.size(); ++m) + cout << " " << ((double)cat_cnt[m]) / nsites; + cout << endl; +} + void PhyloTree::writeSiteRates(ostream &out, bool bayes, int partid) { - DoubleVector pattern_rates; - IntVector pattern_cat; - int ncategory = 1; - - if (bayes) - ncategory = site_rate->computePatternRates(pattern_rates, pattern_cat); - else - optimizePatternRates(pattern_rates); - - if (pattern_rates.empty()) return; - size_t nsite = aln->getNSite(); - - out.setf(ios::fixed,ios::floatfield); - out.precision(5); - //cout << __func__ << endl; - IntVector count; - count.resize(ncategory, 0); - for (size_t i = 0; i < nsite; ++i) { - int ptn = aln->getPatternID(i); - if (partid >= 0) - out << partid << "\t"; - out << i+1 << "\t"; - if (pattern_rates[ptn] >= MAX_SITE_RATE) out << "100.0"; else out << pattern_rates[ptn]; - //cout << i << " "<< ptn << " " << pattern_cat[ptn] << endl; - if (!pattern_cat.empty()) { - int site_cat; - double cat_rate; - if (site_rate->getPInvar() == 0.0) { - site_cat = pattern_cat[ptn]+1; - cat_rate = site_rate->getRate(pattern_cat[ptn]); - } else { - site_cat = pattern_cat[ptn]; - if (site_cat == 0) - cat_rate = 0.0; - else - cat_rate = site_rate->getRate(pattern_cat[ptn]-1); - } - out << "\t" << site_cat << "\t" << cat_rate; - count[pattern_cat[ptn]]++; - } - out << endl; - } - if (bayes) { - cout << "Empirical proportions for each category:"; - for (size_t i = 0; i < count.size(); ++i) - cout << " " << ((double)count[i])/nsite; - cout << endl; - } + DoubleVector ptn_rates; + IntVector ptn_cat; + DoubleVector ptn_pp; + int ncat = 1; + // get precomputed ptn_rates of Meyer & von Haeseler (2003) + if (getRate()->isSiteSpecificRate() || getRate()->getPtnCat(0) != -1) + ncat = getRate()->getPatternRates(ptn_rates, ptn_cat); + // try to compute ptn_rates + else if (bayes) computePatternRate(ptn_rates, &ptn_cat, &ptn_pp, &ncat); + else optimizePatternRates(ptn_rates); + if (ptn_rates.empty()) { + if (partid >= 0) + cout << "Part " << partid << ": "; + cout << "No rate mixture model specified" << endl; + return; + } + // write into outstream + out.setf(ios::fixed, ios::floatfield); + out.precision(5); + IntVector cat_cnt; + cat_cnt.resize(ncat, 0); + size_t nsites = getAlnNSite(); + for (size_t site = 0; site < nsites; ++site) { + int ptn = aln->getPatternID(site); + if (partid >= 0) + out << partid << "\t"; + out << site + 1 << "\t"; + if (ptn_rates[ptn] >= MAX_SITE_RATE) out << "100.0"; else out << ptn_rates[ptn]; + if (!ptn_cat.empty()) { + int site_cat; // always: 0=invar, 1=slow, etc. + double cat_rate; + site_cat = (getRate()->getPInvar()) ? ptn_cat[ptn] : ptn_cat[ptn] + 1; + if (site_cat == 0) cat_rate = 0.0; + else cat_rate = getRate()->getRate(site_cat - 1); + out << "\t" << site_cat << "\t" << cat_rate; + if (!ptn_pp.empty()) + out << "\t" << ptn_pp[ptn]; + cat_cnt[ptn_cat[ptn]] ++; + } + out << endl; + } + // print info to logs + if (bayes) { + if (partid >= 0) + cout << "Part " << partid << ": "; + cout << "Empirical proportions for each category:"; + for (size_t c = 0; c < cat_cnt.size(); ++c) + cout << " " << ((double)cat_cnt[c]) / nsites; + cout << endl; + } } void PhyloTree::writeBranches(ostream &out) { diff --git a/tree/phylotree.h b/tree/phylotree.h index b8a19f986..e6bda3fe1 100644 --- a/tree/phylotree.h +++ b/tree/phylotree.h @@ -1058,12 +1058,26 @@ class PhyloTree : public MTree, public Optimization, public CheckpointFactory { */ virtual double computePatternLhCat(SiteLoglType wsl); - /** - compute state frequency for each pattern (for Huaichun) - @param[out] ptn_state_freq state frequency vector per pattern, - should be pre-allocated with size of num_patterns * num_states - */ - void computePatternStateFreq(double *ptn_state_freq); + /** + compute state frequencies for each pattern (for site-specific models and -wsf option) + @param[out] all_ptn_state_freq state frequencies per pattern, + should be passed as a NULL pointer, + will be allocated with the size of num_patterns * num_states + @param[out] ptn_cat (optional) the best-fitting mixture component per pattern + @param[out] ptn_pp (optional) PP of the best-fitting mixture component per pattern + @param[out] ncat (optional) the number of mixture components + */ + void computePatternStateFreq(double* &all_ptn_state_freq, IntVector *ptn_cat = NULL, DoubleVector *ptn_pp = NULL, int *ncat = NULL); + + /** + compute rates for each pattern (for site-specific models and -wsr option) + @param[out] ptn_rate rate scaler per pattern, + should be passed as an empty vector + @param[out] ptn_cat (optional) the best-fitting rate category per pattern + @param[out] ptn_pp (optional) PP of the best-fitting rate category per pattern + @param[out] ncat (optional) the number of rate categories + */ + void computePatternRate(DoubleVector &ptn_rate, IntVector *ptn_cat = NULL, DoubleVector *ptn_pp = NULL, int *ncat = NULL); /**************************************************************************** ancestral sequence reconstruction @@ -2231,15 +2245,24 @@ class PhyloTree : public MTree, public Optimization, public CheckpointFactory { */ void forceConvertingToUnrooted(); + /** + write site state frequencies to a file in the following format: + 1 freq(A)_1 freq(R)_1 ... + 2 freq(A)_2 freq(R)_2 ... + ... + This function should be used by -wsf option + @param out output stream to write freqs + */ + virtual void writeSiteFreqs(ostream &out, int partid = -1); /** - write site-rates to a file in the following format: - 1 rate_1 - 2 rate_2 - .... - This function will call computePatternRates() - @param out output stream to write rates - @param bayes TRUE to use empirical Bayesian, false for ML method + write site rates to a file in the following format: + 1 rate_1 + 2 rate_2 + ... + This function should be used by -wsr option + @param out output stream to write rates + @param bayes TRUE to use empirical Bayesian, false for ML method */ virtual void writeSiteRates(ostream &out, bool bayes, int partid = -1); diff --git a/utils/tools.cpp b/utils/tools.cpp index a7e91dc2f..259ef7870 100644 --- a/utils/tools.cpp +++ b/utils/tools.cpp @@ -1265,8 +1265,10 @@ void parseArg(int argc, char *argv[], Params ¶ms) { params.print_partition_lh = false; params.print_marginal_prob = false; params.print_site_prob = WSL_NONE; - params.print_site_state_freq = WSF_NONE; + params.print_site_state_freq = 0; params.print_site_rate = 0; + params.site_state_freq_type = WSF_NONE; + params.site_rate_type = WSR_NONE; params.print_trees_site_posterior = 0; params.print_ancestral_sequence = AST_NONE; params.min_ancestral_prob = 0.0; @@ -1366,7 +1368,9 @@ void parseArg(int argc, char *argv[], Params ¶ms) { params.bootlh_test = 0; params.bootlh_partitions = NULL; params.site_freq_file = NULL; + params.site_rate_file = NULL; params.tree_freq_file = NULL; + params.tree_rate_file = NULL; params.num_threads = 1; params.num_threads_max = 10000; params.openmp_by_model = false; @@ -1376,6 +1380,7 @@ void parseArg(int argc, char *argv[], Params ¶ms) { params.root_state = NULL; params.print_bootaln = false; params.print_boot_site_freq = false; + params.print_boot_site_rate = false; params.print_subaln = false; params.print_partition_info = false; params.print_conaln = false; @@ -1863,14 +1868,15 @@ void parseArg(int argc, char *argv[], Params ¶ms) { params.tree_gen = YULE_HARDING; continue; } - if (strcmp(argv[cnt], "-rs") == 0) { - cnt++; - if (cnt >= argc) - throw "Use -rs "; - params.tree_gen = YULE_HARDING; - params.aln_file = argv[cnt]; - continue; - } + // OBSOLETE: alignment should be passed with the -s option + //if (strcmp(argv[cnt], "-rs") == 0) { + // cnt++; + // if (cnt >= argc) + // throw "Use -rs "; + // params.tree_gen = YULE_HARDING; + // params.aln_file = argv[cnt]; + // continue; + //} if (strcmp(argv[cnt], "-rstar") == 0) { cnt++; if (cnt >= argc) @@ -2867,15 +2873,13 @@ void parseArg(int argc, char *argv[], Params ¶ms) { throw " must be positive!"; continue; } - if (strcmp(argv[cnt], "--site-rate") == 0) { + if (strcmp(argv[cnt], "--site-rate-type") == 0) { cnt++; if (cnt >= argc) - throw "Use --site-rate MEAN/SAMPLING/MODEL"; - + throw "Use --site-rate-type MEAN/SAMPLING/MODEL"; // convert option to uppercase string option = argv[cnt]; transform(option.begin(), option.end(), option.begin(), ::toupper); - if (option.compare("MEAN") == 0) params.alisim_rate_heterogeneity = POSTERIOR_MEAN; else if (option.compare("SAMPLING") == 0) @@ -2883,18 +2887,16 @@ void parseArg(int argc, char *argv[], Params ¶ms) { else if (option.compare("MODEL") == 0) params.alisim_rate_heterogeneity = UNSPECIFIED; else - throw "Use --site-rate MEAN/SAMPLING/MODEL"; + throw "Use --site-rate-type MEAN/SAMPLING/MODEL"; continue; } - if (strcmp(argv[cnt], "--site-freq") == 0) { + if (strcmp(argv[cnt], "--site-freq-type") == 0) { cnt++; if (cnt >= argc) - throw "Use --site-freq MEAN/SAMPLING/MODEL"; - + throw "Use --site-freq-type MEAN/SAMPLING/MODEL"; // convert option to uppercase string option = argv[cnt]; transform(option.begin(), option.end(), option.begin(), ::toupper); - if (option.compare("MEAN") == 0) params.alisim_stationarity_heterogeneity = POSTERIOR_MEAN; else if (option.compare("SAMPLING") == 0) @@ -2902,7 +2904,7 @@ void parseArg(int argc, char *argv[], Params ¶ms) { else if (option.compare("MODEL") == 0) params.alisim_stationarity_heterogeneity = UNSPECIFIED; else - throw "Use --site-freq MEAN/SAMPLING/MODEL"; + throw "Use --site-freq-type MEAN/SAMPLING/MODEL"; continue; } if (strcmp(argv[cnt], "--indel") == 0) { @@ -3282,7 +3284,7 @@ void parseArg(int argc, char *argv[], Params ¶ms) { params.model_subset = argv[cnt]; continue; } - if (strcmp(argv[cnt], "-mfreq") == 0 || strcmp(argv[cnt], "--freqs") == 0) { + if (strcmp(argv[cnt], "-mfreq") == 0 || strcmp(argv[cnt], "--mfreq") == 0 || strcmp(argv[cnt], "--freqs") == 0) { cnt++; if (cnt >= argc) throw "Use -mfreq "; @@ -3477,29 +3479,81 @@ void parseArg(int argc, char *argv[], Params ¶ms) { continue; } + /** Specify precomputed site-specific model */ if (strcmp(argv[cnt], "-fs") == 0 || strcmp(argv[cnt], "--site-freq") == 0) { - if (params.tree_freq_file) - throw "Specifying both -fs and -ft not allowed"; + if (params.tree_freq_file) + throw "Specifying both -fs and -ft or -frt not allowed"; cnt++; if (cnt >= argc) throw "Use -fs "; params.site_freq_file = argv[cnt]; -// params.SSE = LK_EIGEN; + //params.SSE = LK_EIGEN; + continue; + } + if (strcmp(argv[cnt], "-rs") == 0 || strcmp(argv[cnt], "--site-rate") == 0) { + if (params.tree_freq_file || params.tree_rate_file) + throw "Specifying both -rs and -ft or -rt or -frt not allowed"; + cnt++; + if (cnt >= argc) + throw "Use -rs "; + params.site_rate_file = argv[cnt]; continue; } + + /** Compute new site-specific model */ if (strcmp(argv[cnt], "-ft") == 0 || strcmp(argv[cnt], "--tree-freq") == 0) { - if (params.site_freq_file) - throw "Specifying both -fs and -ft not allowed"; - cnt++; + if (params.site_freq_file || params.site_rate_file) + throw "Specifying both -fs or -rs and -ft not allowed"; + if (params.tree_rate_file) + throw "Specifying both -ft and -rt not allowed, try using -frt option"; + cnt++; if (cnt >= argc) throw "Use -ft "; - params.tree_freq_file = argv[cnt]; - if (params.print_site_state_freq == WSF_NONE) - params.print_site_state_freq = WSF_POSTERIOR_MEAN; - continue; - } + params.tree_freq_file = argv[cnt]; + if (params.site_state_freq_type == WSF_NONE) + params.site_state_freq_type = WSF_POSTERIOR_MEAN; + continue; + } + if (strcmp(argv[cnt], "-rt") == 0 || strcmp(argv[cnt], "--tree-rate") == 0) { + if (params.site_rate_file) + throw "Specifying both -rs and -rt not allowed"; + if (params.tree_freq_file) + throw "Specifying both -ft and -rt not allowed, try using -frt option"; + cnt++; + if (cnt >= argc) + throw "Use -rt "; + params.tree_rate_file = argv[cnt]; + if (params.site_rate_type == WSR_NONE) + params.site_rate_type = WSR_POSTERIOR_MEAN; + continue; + } + if (strcmp(argv[cnt], "-frt") == 0 || strcmp(argv[cnt], "--tree-freq-rate") == 0) { + if (params.site_freq_file || params.site_rate_file) + throw "Specifying both -fs or -rs and -frt not allowed"; + if (params.tree_freq_file || params.tree_rate_file) + throw "Specifying both -ft or -rt and -frt not allowed"; + cnt++; + if (cnt >= argc) + throw "Use -frt "; + params.tree_freq_file = params.tree_rate_file = argv[cnt]; + if (params.site_state_freq_type == WSF_NONE) + params.site_state_freq_type = WSF_POSTERIOR_MEAN; + if (params.site_rate_type == WSR_NONE) + params.site_rate_type = WSR_POSTERIOR_MEAN; + continue; + } + // use max instead of mean approximation for site-specific parameter computation, + // applies also to -wsf and -wsr options + if (strcmp(argv[cnt], "-fmax") == 0 || strcmp(argv[cnt], "--freq-max") == 0) { + params.site_state_freq_type = WSF_POSTERIOR_MAX; + continue; + } + if (strcmp(argv[cnt], "-rmax") == 0 || strcmp(argv[cnt], "--rate-max") == 0) { + params.site_rate_type = WSR_POSTERIOR_MAX; + continue; + } - if (strcmp(argv[cnt], "-fconst") == 0) { + if (strcmp(argv[cnt], "-fconst") == 0 || strcmp(argv[cnt], "--fconst") == 0) { cnt++; if (cnt >= argc) throw "Use -fconst "; @@ -3901,20 +3955,23 @@ void parseArg(int argc, char *argv[], Params ¶ms) { params.print_tree_lh = true; continue; } + if (strcmp(argv[cnt], "-wit") == 0) { + params.write_init_tree = true; + continue; + } + + /** write branch lengths */ + // general BL if (strcmp(argv[cnt], "-wbl") == 0) { params.print_branch_lengths = true; continue; } - if (strcmp(argv[cnt], "-wit") == 0) { - params.write_init_tree = true; - continue; - } - - if (strcmp(argv[cnt], "--write-branches") == 0) { - params.write_branches = true; - continue; - } - + // partition-specific BL + if (strcmp(argv[cnt], "-wpbl") == 0 || strcmp(argv[cnt], "--write-branches") == 0) { + params.write_branches = true; + continue; + } + // if (strcmp(argv[cnt], "-nodup") == 0) { // params.avoid_duplicated_trees = true; // continue; @@ -4015,11 +4072,11 @@ void parseArg(int argc, char *argv[], Params ¶ms) { continue; } + /** Mixture model site LL */ if (strcmp(argv[cnt], "-wslg") == 0 || strcmp(argv[cnt], "-wslr") == 0) { params.print_site_lh = WSL_RATECAT; continue; } - if (strcmp(argv[cnt], "-wslm") == 0) { params.print_site_lh = WSL_MIXTURE; continue; @@ -4029,11 +4086,11 @@ void parseArg(int argc, char *argv[], Params ¶ms) { continue; } + /** Mixture model site probas */ if (strcmp(argv[cnt], "-wspr") == 0) { params.print_site_prob = WSL_RATECAT; continue; } - if (strcmp(argv[cnt], "-wspm") == 0) { params.print_site_prob = WSL_MIXTURE; continue; @@ -4043,51 +4100,54 @@ void parseArg(int argc, char *argv[], Params ¶ms) { continue; } + if (strcmp(argv[cnt], "-wsptrees") == 0) { + params.print_trees_site_posterior = 1; + continue; + } + + /** Ancestral sequence reconstruction */ if (strcmp(argv[cnt], "-asr") == 0 || strcmp(argv[cnt], "--ancestral") == 0) { params.print_ancestral_sequence = AST_MARGINAL; - params.ignore_identical_seqs = false; + params.ignore_identical_seqs = false; continue; } - if (strcmp(argv[cnt], "-asr-min") == 0 || strcmp(argv[cnt], "--asr-min") == 0) { - cnt++; + cnt++; if (cnt >= argc) throw "Use -asr-min "; - - params.min_ancestral_prob = convert_double(argv[cnt]); - if (params.min_ancestral_prob < 0 || params.min_ancestral_prob > 1) - throw "Minimum ancestral probability [-asr-min] must be between 0 and 1.0"; - continue; - } - + params.min_ancestral_prob = convert_double(argv[cnt]); + if (params.min_ancestral_prob < 0 || params.min_ancestral_prob > 1) + throw "Minimum ancestral probability [-asr-min] must be between 0 and 1.0"; + continue; + } if (strcmp(argv[cnt], "-asr-joint") == 0) { params.print_ancestral_sequence = AST_JOINT; - params.ignore_identical_seqs = false; + params.ignore_identical_seqs = false; continue; } - if (strcmp(argv[cnt], "-wsr") == 0 || strcmp(argv[cnt], "--rate") == 0) { - params.print_site_rate |= 1; + /** Mixture model site state freqs */ + if (strcmp(argv[cnt], "-wsf") == 0 || strcmp(argv[cnt], "--freq") == 0) { + params.print_site_state_freq = 1; + if (params.site_state_freq_type == WSF_NONE) + params.site_state_freq_type = WSF_POSTERIOR_MEAN; continue; } - if (strcmp(argv[cnt], "--mlrate") == 0) { - params.print_site_rate |= 2; - continue; - } - - if (strcmp(argv[cnt], "-wsptrees") == 0) { - params.print_trees_site_posterior = 1; - continue; - } - if (strcmp(argv[cnt], "-wsf") == 0) { - params.print_site_state_freq = WSF_POSTERIOR_MEAN; + /** Mixture model site rates */ + if (strcmp(argv[cnt], "-wsr") == 0 || strcmp(argv[cnt], "--rate") == 0) { + params.print_site_rate |= 1; + if (params.site_rate_type == WSR_NONE) + params.site_rate_type = WSR_POSTERIOR_MEAN; continue; } - if (strcmp(argv[cnt], "--freq-max") == 0 || strcmp(argv[cnt], "-fmax") == 0) { - params.print_site_state_freq = WSF_POSTERIOR_MAX; + + /** ML-optimized site rates */ + if (strcmp(argv[cnt], "--mlrate") == 0) { + params.print_site_rate |= 2; continue; } + if (strcmp(argv[cnt], "-wba") == 0) { params.print_bootaln = true; continue; @@ -4096,6 +4156,10 @@ void parseArg(int argc, char *argv[], Params ¶ms) { params.print_boot_site_freq = true; continue; } + if (strcmp(argv[cnt], "-wbsr") == 0) { + params.print_boot_site_rate = true; + continue; + } if (strcmp(argv[cnt], "-wsa") == 0) { params.print_subaln = true; continue; @@ -5804,11 +5868,12 @@ void parseArg(int argc, char *argv[], Params ¶ms) { if (params.alisim_active && !params.aln_file && !params.user_file && !params.partition_file && params.tree_gen == NONE) outError("A tree filepath is a mandatory input to execute AliSim when neither Inference mode nor Random mode (generating a random tree) is inactive. Use -t ; or Activate the inference mode by -s ; or Activate Random mode by -t RANDOM{,} where is yh, u, cat, bal, bd{,} stands for Yule-Harding, Uniform, Caterpillar, Balanced, Birth-Death model respectively."); - // terminate if using AliSim with -ft or -fs site-specific model (ModelSet) + // terminate if using AliSim with site-specific model (ModelSet) // computeTransMatix has not yet implemented for ModelSet if (params.alisim_active && (params.tree_freq_file || params.site_freq_file)) - outError("Sorry! `-ft` (--site-freq) and `-fs` (--tree-freq) options are not fully supported in AliSim. However, AliSim can estimate posterior mean frequencies from the alignment. Please try again without `-ft` and `-fs` options!"); - + outError("Sorry! `-ft` (--tree-freq) and `-fs` (--site-freq) options are not fully supported in AliSim. However, AliSim can estimate posterior mean frequencies from the alignment. Please try again without `-ft` and `-fs` options!"); + if (params.alisim_active && (params.tree_rate_file || params.site_rate_file)) + outError("Sorry! `-rt` (--tree-rate) and `-rs` (--site-rate) options are not fully supported in AliSim. However, AliSim can estimate posterior mean rates from the alignment. Please try again without `-rt` and `-rs` options!"); // set default filename for the random tree if AliSim is running in Random mode if (params.alisim_active && !params.user_file && params.tree_gen != NONE) { @@ -5912,9 +5977,9 @@ void usage(char* argv[]) { cout << endl; cout << "OPTIONS FOR GENOMIC EPIDEMIOLOGICAL ANALYSES:" << endl; cout << " --pathogen Apply CMAPLE tree search algorithm if sequence" << endl; - cout << " divergence is low, otherwise, apply IQ-TREE algorithm." << endl; + cout << " divergence is low, otherwise apply IQ-TREE algorithm" << endl; cout << " --pathogen-force Apply CMAPLE tree search algorithm regardless" << endl; - cout << " of sequence divergence." << endl; + cout << " of sequence divergence" << endl; cout << endl; //cout << "HIDDEN OPTIONS: see the source code file pda.cpp::parseArg()" << endl; @@ -5922,51 +5987,53 @@ void usage(char* argv[]) { } void usage_alisim(){ - cout << endl << "ALISIM: ALIGNMENT SIMULATOR" << endl + cout + << endl << "ALISIM: ALIGNMENT SIMULATOR" << endl << endl << "Usage: iqtree --alisim [-m MODEL] [-t TREE] ..." << endl << endl - << " --alisim OUTPUT_ALIGNMENT Activate AliSim and specify the output alignment filename"<< endl - << " -t TREE_FILE Set the input tree file name" << endl - << " --length LENGTH Set the length of the root sequence" << endl - << " --num-alignments NUMBER Set the number of output datasets" << endl - << " --seqtype STRING BIN, DNA, AA, CODON, MORPH{NUM_STATES} (default: auto-detect)" << endl - << " For morphological data, 0, Set the insertion and deletion rate of the indel model,"<< endl - << " relative to the substitution rate"<< endl - << " --indel-size , Set the insertion and deletion size distributions" << endl - << " --sub-level-mixture Enable the feature to simulate substitution-level mixture model"<< endl - << " --no-unaligned Disable outputing a file of unaligned sequences "<< endl - << " when using indel models"<< endl - << " --root-seq FILE,SEQ_NAME Specify the root sequence from an alignment" << endl - << " -s FILE Specify the input sequence alignment" << endl - << " --no-copy-gaps Disable copying gaps from input alignment (default: false)" << endl - << " --site-freq