1313#ifndef ALBATROSS_CORE_INDEXING_H
1414#define ALBATROSS_CORE_INDEXING_H
1515
16+ #include " core/dataset.h"
1617#include < Eigen/Core>
18+ #include < functional>
1719#include < iostream>
20+ #include < map>
1821#include < vector>
1922
2023namespace albatross {
2124
25+ using s32 = int32_t ;
26+ using FoldIndices = std::vector<s32>;
27+ using FoldName = std::string;
28+ using FoldIndexer = std::map<FoldName, FoldIndices>;
29+
2230/*
2331 * Extract a subset of a standard vector.
2432 */
25-
2633template <typename SizeType, typename X>
2734inline std::vector<X> subset (const std::vector<SizeType> &indices,
2835 const std::vector<X> &v) {
@@ -47,6 +54,21 @@ inline Eigen::VectorXd subset(const std::vector<SizeType> &indices,
4754 return out;
4855}
4956
57+ /*
58+ * Extracts a subset of columns from an Eigen::Matrix
59+ */
60+ template <typename SizeType>
61+ inline Eigen::MatrixXd subset_cols (const std::vector<SizeType> &col_indices,
62+ const Eigen::MatrixXd &v) {
63+ Eigen::MatrixXd out (v.rows (), col_indices.size ());
64+ for (std::size_t i = 0 ; i < col_indices.size (); i++) {
65+ auto ii = static_cast <Eigen::Index>(i);
66+ auto col_index = static_cast <Eigen::Index>(col_indices[i]);
67+ out.col (ii) = v.col (col_index);
68+ }
69+ return out;
70+ }
71+
5072/*
5173 * Extracts a subset of an Eigen::Matrix for the given row and column
5274 * indices.
@@ -90,6 +112,157 @@ symmetric_subset(const std::vector<SizeType> &indices,
90112 return subset (indices, v.diagonal ()).asDiagonal ();
91113}
92114
115+ template <typename CovarianceType, typename SizeType>
116+ Distribution<CovarianceType> subset (const std::vector<SizeType> &indices,
117+ const Distribution<CovarianceType> &dist) {
118+ auto mean = subset (indices, Eigen::VectorXd (dist.mean ));
119+ if (dist.has_covariance ()) {
120+ auto cov = symmetric_subset (indices, dist.covariance );
121+ return Distribution<CovarianceType>(mean, cov);
122+ } else {
123+ return Distribution<CovarianceType>(mean);
124+ }
125+ }
126+
127+ /*
128+ * A combination of training and testing datasets, typically used in cross
129+ * validation.
130+ */
131+ template <typename FeatureType> struct RegressionFold {
132+ RegressionDataset<FeatureType> train_dataset;
133+ RegressionDataset<FeatureType> test_dataset;
134+ FoldName name;
135+ FoldIndices test_indices;
136+
137+ RegressionFold (const RegressionDataset<FeatureType> &train_dataset_,
138+ const RegressionDataset<FeatureType> &test_dataset_,
139+ const FoldName &name_, const FoldIndices &test_indices_)
140+ : train_dataset(train_dataset_), test_dataset(test_dataset_), name(name_),
141+ test_indices (test_indices_){};
142+ };
143+
144+ inline FoldIndices get_train_indices (const FoldIndices &test_indices,
145+ const int n) {
146+ const s32 k = static_cast <s32>(test_indices.size ());
147+ // The train indices are all the indices that are not test indices.
148+ FoldIndices train_indices (n - k);
149+ s32 train_cnt = 0 ;
150+ for (s32 j = 0 ; j < n; j++) {
151+ if (std::find (test_indices.begin (), test_indices.end (), j) ==
152+ test_indices.end ()) {
153+ train_indices[train_cnt] = j;
154+ train_cnt++;
155+ }
156+ }
157+ return train_indices;
158+ }
159+
160+ /*
161+ * Each flavor of cross validation can be described by a set of
162+ * FoldIndices, which store which indices should be used for the
163+ * test cases. This function takes a map from FoldName to
164+ * FoldIndices and a dataset and creates the resulting folds.
165+ */
166+ template <typename FeatureType>
167+ static inline std::vector<RegressionFold<FeatureType>>
168+ folds_from_fold_indexer (const RegressionDataset<FeatureType> &dataset,
169+ const FoldIndexer &groups) {
170+ // For a dataset with n features, we'll have n folds.
171+ const s32 n = static_cast <s32>(dataset.features .size ());
172+ std::vector<RegressionFold<FeatureType>> folds;
173+ // For each fold, partition into train and test sets.
174+ for (const auto &pair : groups) {
175+ // These get exposed inside the returned RegressionFold and because
176+ // we'd like to prevent modification of the output from this function
177+ // from changing the input FoldIndexer we perform a copy here.
178+ const FoldName group_name (pair.first );
179+ const FoldIndices test_indices (pair.second );
180+ const auto train_indices = get_train_indices (test_indices, n);
181+
182+ std::vector<FeatureType> train_features =
183+ subset (train_indices, dataset.features );
184+ MarginalDistribution train_targets = subset (train_indices, dataset.targets );
185+
186+ std::vector<FeatureType> test_features =
187+ subset (test_indices, dataset.features );
188+ MarginalDistribution test_targets = subset (test_indices, dataset.targets );
189+
190+ assert (train_features.size () == train_targets.size ());
191+ assert (test_features.size () == test_targets.size ());
192+ assert (test_targets.size () + train_targets.size () == n);
193+
194+ const RegressionDataset<FeatureType> train_split (train_features,
195+ train_targets);
196+ const RegressionDataset<FeatureType> test_split (test_features,
197+ test_targets);
198+ folds.push_back (RegressionFold<FeatureType>(train_split, test_split,
199+ group_name, test_indices));
200+ }
201+ return folds;
202+ }
203+
204+ template <typename FeatureType>
205+ static inline FoldIndexer
206+ leave_one_out_indexer (const RegressionDataset<FeatureType> &dataset) {
207+ FoldIndexer groups;
208+ for (s32 i = 0 ; i < static_cast <s32>(dataset.features .size ()); i++) {
209+ FoldName group_name = std::to_string (i);
210+ groups[group_name] = {i};
211+ }
212+ return groups;
213+ }
214+
215+ /*
216+ * Splits a dataset into cross validation folds where each fold contains all but
217+ * one predictor/target pair.
218+ */
219+ template <typename FeatureType>
220+ static inline FoldIndexer leave_one_group_out_indexer (
221+ const RegressionDataset<FeatureType> &dataset,
222+ const std::function<FoldName(const FeatureType &)> &get_group_name) {
223+ FoldIndexer groups;
224+ for (s32 i = 0 ; i < static_cast <s32>(dataset.features .size ()); i++) {
225+ const std::string k =
226+ get_group_name (dataset.features [static_cast <std::size_t >(i)]);
227+ // Get the existing indices if we've already encountered this group_name
228+ // otherwise initialize a new one.
229+ FoldIndices indices;
230+ if (groups.find (k) == groups.end ()) {
231+ indices = FoldIndices ();
232+ } else {
233+ indices = groups[k];
234+ }
235+ // Add the current index.
236+ indices.push_back (i);
237+ groups[k] = indices;
238+ }
239+ return groups;
240+ }
241+
242+ /*
243+ * Generates cross validation folds which represent leave one out
244+ * cross validation.
245+ */
246+ template <typename FeatureType>
247+ static inline std::vector<RegressionFold<FeatureType>>
248+ leave_one_out (const RegressionDataset<FeatureType> &dataset) {
249+ return folds_from_fold_indexer<FeatureType>(
250+ dataset, leave_one_out_indexer<FeatureType>(dataset));
251+ }
252+
253+ /*
254+ * Uses a `get_group_name` function to bucket each FeatureType into
255+ * a group, then holds out one group at a time.
256+ */
257+ template <typename FeatureType>
258+ static inline std::vector<RegressionFold<FeatureType>> leave_one_group_out (
259+ const RegressionDataset<FeatureType> &dataset,
260+ const std::function<FoldName(const FeatureType &)> &get_group_name) {
261+ const FoldIndexer indexer =
262+ leave_one_group_out_indexer<FeatureType>(dataset, get_group_name);
263+ return folds_from_fold_indexer<FeatureType>(dataset, indexer);
264+ }
265+
93266} // namespace albatross
94267
95268#endif
0 commit comments