Skip to content

Commit 23c5d90

Browse files
authored
Merge pull request #45 from akleeman/cross_validation_refactor
Cross Validation Refactor
2 parents 4401eea + 86e9468 commit 23c5d90

15 files changed

+595
-362
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ set(albatross_HEADERS
4141
albatross/map_utils.h
4242
albatross/csv_utils.h
4343
albatross/core/keys.h
44+
albatross/core/dataset.h
4445
albatross/core/model.h
4546
albatross/core/model_adapter.h
4647
albatross/core/traits.h

albatross/core/dataset.h

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
/*
2+
* Copyright (C) 2018 Swift Navigation Inc.
3+
* Contact: Swift Navigation <[email protected]>
4+
*
5+
* This source is subject to the license found in the file 'LICENSE' which must
6+
* be distributed together with this source. All other rights reserved.
7+
*
8+
* THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND,
9+
* EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED
10+
* WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE.
11+
*/
12+
13+
#ifndef ALBATROSS_CORE_DATASET_H
14+
#define ALBATROSS_CORE_DATASET_H
15+
16+
#include "core/distribution.h"
17+
#include "core/traits.h"
18+
#include <Eigen/Core>
19+
#include <cereal/archives/json.hpp>
20+
#include <map>
21+
#include <vector>
22+
23+
namespace albatross {
24+
25+
// A JointDistribution has a dense covariance matrix, which
26+
// contains the covariance between each variable and all others.
27+
using JointDistribution = Distribution<Eigen::MatrixXd>;
28+
29+
// We use a wrapper around DiagonalMatrix in order to make
30+
// the resulting distribution serializable
31+
using DiagonalMatrixXd =
32+
Eigen::SerializableDiagonalMatrix<double, Eigen::Dynamic>;
33+
// A MarginalDistribution has only a digaonal covariance
34+
// matrix, so in turn only describes the variance of each
35+
// variable independent of all others.
36+
using MarginalDistribution = Distribution<DiagonalMatrixXd>;
37+
38+
/*
39+
* A RegressionDataset holds two vectors of data, the features
40+
* where a single feature can be any class that contains the information used
41+
* to make predictions of the target. This is called a RegressionDataset since
42+
* it is assumed that each feature is regressed to a single double typed
43+
* target.
44+
*/
45+
template <typename FeatureType> struct RegressionDataset {
46+
std::vector<FeatureType> features;
47+
MarginalDistribution targets;
48+
std::map<std::string, std::string> metadata;
49+
50+
RegressionDataset(){};
51+
52+
RegressionDataset(const std::vector<FeatureType> &features_,
53+
const MarginalDistribution &targets_)
54+
: features(features_), targets(targets_) {
55+
// If the two inputs aren't the same size they clearly aren't
56+
// consistent.
57+
assert(static_cast<int>(features.size()) ==
58+
static_cast<int>(targets.size()));
59+
}
60+
61+
RegressionDataset(const std::vector<FeatureType> &features_,
62+
const Eigen::VectorXd &targets_)
63+
: RegressionDataset(features_, MarginalDistribution(targets_)) {}
64+
65+
bool operator==(const RegressionDataset &other) const {
66+
return (features == other.features && targets == other.targets &&
67+
metadata == other.metadata);
68+
}
69+
70+
template <class Archive>
71+
typename std::enable_if<valid_in_out_serializer<FeatureType, Archive>::value,
72+
void>::type
73+
serialize(Archive &archive) {
74+
archive(cereal::make_nvp("features", features));
75+
archive(cereal::make_nvp("targets", targets));
76+
archive(cereal::make_nvp("metadata", metadata));
77+
}
78+
79+
template <class Archive>
80+
typename std::enable_if<!valid_in_out_serializer<FeatureType, Archive>::value,
81+
void>::type
82+
serialize(Archive &archive) {
83+
static_assert(delay_static_assert<Archive>::value,
84+
"In order to serialize a RegressionDataset the corresponding "
85+
"FeatureType must be serializable.");
86+
}
87+
};
88+
89+
} // namespace albatross
90+
91+
#endif

albatross/core/distribution.h

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
#include "cereal/cereal.hpp"
1717
#include "core/traits.h"
1818
#include "eigen/serializable_diagonal_matrix.h"
19-
#include "indexing.h"
2019
#include <Eigen/Core>
2120
#include <iostream>
2221
#include <map>
@@ -109,18 +108,6 @@ using DiagonalMatrixXd =
109108
// variable independent of all others.
110109
using MarginalDistribution = Distribution<DiagonalMatrixXd>;
111110

112-
template <typename CovarianceType, typename SizeType>
113-
Distribution<CovarianceType> subset(const std::vector<SizeType> &indices,
114-
const Distribution<CovarianceType> &dist) {
115-
auto mean = subset(indices, Eigen::VectorXd(dist.mean));
116-
if (dist.has_covariance()) {
117-
auto cov = symmetric_subset(indices, dist.covariance);
118-
return Distribution<CovarianceType>(mean, cov);
119-
} else {
120-
return Distribution<CovarianceType>(mean);
121-
}
122-
}
123-
124111
} // namespace albatross
125112

126113
#endif

albatross/core/indexing.h

Lines changed: 174 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,23 @@
1313
#ifndef ALBATROSS_CORE_INDEXING_H
1414
#define ALBATROSS_CORE_INDEXING_H
1515

#include "core/dataset.h"
#include <Eigen/Core>
#include <algorithm>
#include <cstdint>
#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <vector>
1922

2023
namespace albatross {
2124

25+
// 32-bit signed index type used throughout the indexing helpers.
using s32 = int32_t;
// The indices of the held-out (test) observations for a single fold.
using FoldIndices = std::vector<s32>;
// Human-readable identifier for a fold.
using FoldName = std::string;
// Maps each fold's name to the test indices that define it.
using FoldIndexer = std::map<FoldName, FoldIndices>;
29+
2230
/*
2331
* Extract a subset of a standard vector.
2432
*/
25-
2633
template <typename SizeType, typename X>
2734
inline std::vector<X> subset(const std::vector<SizeType> &indices,
2835
const std::vector<X> &v) {
@@ -47,6 +54,21 @@ inline Eigen::VectorXd subset(const std::vector<SizeType> &indices,
4754
return out;
4855
}
4956

57+
/*
58+
* Extracts a subset of columns from an Eigen::Matrix
59+
*/
60+
template <typename SizeType>
61+
inline Eigen::MatrixXd subset_cols(const std::vector<SizeType> &col_indices,
62+
const Eigen::MatrixXd &v) {
63+
Eigen::MatrixXd out(v.rows(), col_indices.size());
64+
for (std::size_t i = 0; i < col_indices.size(); i++) {
65+
auto ii = static_cast<Eigen::Index>(i);
66+
auto col_index = static_cast<Eigen::Index>(col_indices[i]);
67+
out.col(ii) = v.col(col_index);
68+
}
69+
return out;
70+
}
71+
5072
/*
5173
* Extracts a subset of an Eigen::Matrix for the given row and column
5274
* indices.
@@ -90,6 +112,157 @@ symmetric_subset(const std::vector<SizeType> &indices,
90112
return subset(indices, v.diagonal()).asDiagonal();
91113
}
92114

115+
template <typename CovarianceType, typename SizeType>
116+
Distribution<CovarianceType> subset(const std::vector<SizeType> &indices,
117+
const Distribution<CovarianceType> &dist) {
118+
auto mean = subset(indices, Eigen::VectorXd(dist.mean));
119+
if (dist.has_covariance()) {
120+
auto cov = symmetric_subset(indices, dist.covariance);
121+
return Distribution<CovarianceType>(mean, cov);
122+
} else {
123+
return Distribution<CovarianceType>(mean);
124+
}
125+
}
126+
127+
/*
128+
* A combination of training and testing datasets, typically used in cross
129+
* validation.
130+
*/
131+
template <typename FeatureType> struct RegressionFold {
132+
RegressionDataset<FeatureType> train_dataset;
133+
RegressionDataset<FeatureType> test_dataset;
134+
FoldName name;
135+
FoldIndices test_indices;
136+
137+
RegressionFold(const RegressionDataset<FeatureType> &train_dataset_,
138+
const RegressionDataset<FeatureType> &test_dataset_,
139+
const FoldName &name_, const FoldIndices &test_indices_)
140+
: train_dataset(train_dataset_), test_dataset(test_dataset_), name(name_),
141+
test_indices(test_indices_){};
142+
};
143+
144+
/*
 * Returns all indices in [0, n) that do not appear in test_indices,
 * in ascending order. These form the training set for a fold.
 */
inline FoldIndices get_train_indices(const FoldIndices &test_indices,
                                     const int n) {
  // Mark the test indices up front so membership checks are O(1),
  // replacing the previous O(n * k) linear scan per candidate. This
  // also tolerates duplicate test indices, which previously could
  // overrun the preallocated output buffer.
  std::vector<char> is_test(static_cast<std::size_t>(n), 0);
  for (const auto &idx : test_indices) {
    if (idx >= 0 && idx < static_cast<s32>(n)) {
      is_test[static_cast<std::size_t>(idx)] = 1;
    }
  }
  // The train indices are all the indices that are not test indices.
  FoldIndices train_indices;
  train_indices.reserve(static_cast<std::size_t>(n));
  for (s32 j = 0; j < n; j++) {
    if (!is_test[static_cast<std::size_t>(j)]) {
      train_indices.push_back(j);
    }
  }
  return train_indices;
}
159+
160+
/*
161+
* Each flavor of cross validation can be described by a set of
162+
* FoldIndices, which store which indices should be used for the
163+
* test cases. This function takes a map from FoldName to
164+
* FoldIndices and a dataset and creates the resulting folds.
165+
*/
166+
template <typename FeatureType>
167+
static inline std::vector<RegressionFold<FeatureType>>
168+
folds_from_fold_indexer(const RegressionDataset<FeatureType> &dataset,
169+
const FoldIndexer &groups) {
170+
// For a dataset with n features, we'll have n folds.
171+
const s32 n = static_cast<s32>(dataset.features.size());
172+
std::vector<RegressionFold<FeatureType>> folds;
173+
// For each fold, partition into train and test sets.
174+
for (const auto &pair : groups) {
175+
// These get exposed inside the returned RegressionFold and because
176+
// we'd like to prevent modification of the output from this function
177+
// from changing the input FoldIndexer we perform a copy here.
178+
const FoldName group_name(pair.first);
179+
const FoldIndices test_indices(pair.second);
180+
const auto train_indices = get_train_indices(test_indices, n);
181+
182+
std::vector<FeatureType> train_features =
183+
subset(train_indices, dataset.features);
184+
MarginalDistribution train_targets = subset(train_indices, dataset.targets);
185+
186+
std::vector<FeatureType> test_features =
187+
subset(test_indices, dataset.features);
188+
MarginalDistribution test_targets = subset(test_indices, dataset.targets);
189+
190+
assert(train_features.size() == train_targets.size());
191+
assert(test_features.size() == test_targets.size());
192+
assert(test_targets.size() + train_targets.size() == n);
193+
194+
const RegressionDataset<FeatureType> train_split(train_features,
195+
train_targets);
196+
const RegressionDataset<FeatureType> test_split(test_features,
197+
test_targets);
198+
folds.push_back(RegressionFold<FeatureType>(train_split, test_split,
199+
group_name, test_indices));
200+
}
201+
return folds;
202+
}
203+
204+
template <typename FeatureType>
205+
static inline FoldIndexer
206+
leave_one_out_indexer(const RegressionDataset<FeatureType> &dataset) {
207+
FoldIndexer groups;
208+
for (s32 i = 0; i < static_cast<s32>(dataset.features.size()); i++) {
209+
FoldName group_name = std::to_string(i);
210+
groups[group_name] = {i};
211+
}
212+
return groups;
213+
}
214+
215+
/*
216+
* Splits a dataset into cross validation folds where each fold contains all but
217+
* one predictor/target pair.
218+
*/
219+
template <typename FeatureType>
220+
static inline FoldIndexer leave_one_group_out_indexer(
221+
const RegressionDataset<FeatureType> &dataset,
222+
const std::function<FoldName(const FeatureType &)> &get_group_name) {
223+
FoldIndexer groups;
224+
for (s32 i = 0; i < static_cast<s32>(dataset.features.size()); i++) {
225+
const std::string k =
226+
get_group_name(dataset.features[static_cast<std::size_t>(i)]);
227+
// Get the existing indices if we've already encountered this group_name
228+
// otherwise initialize a new one.
229+
FoldIndices indices;
230+
if (groups.find(k) == groups.end()) {
231+
indices = FoldIndices();
232+
} else {
233+
indices = groups[k];
234+
}
235+
// Add the current index.
236+
indices.push_back(i);
237+
groups[k] = indices;
238+
}
239+
return groups;
240+
}
241+
242+
/*
243+
* Generates cross validation folds which represent leave one out
244+
* cross validation.
245+
*/
246+
template <typename FeatureType>
247+
static inline std::vector<RegressionFold<FeatureType>>
248+
leave_one_out(const RegressionDataset<FeatureType> &dataset) {
249+
return folds_from_fold_indexer<FeatureType>(
250+
dataset, leave_one_out_indexer<FeatureType>(dataset));
251+
}
252+
253+
/*
254+
* Uses a `get_group_name` function to bucket each FeatureType into
255+
* a group, then holds out one group at a time.
256+
*/
257+
template <typename FeatureType>
258+
static inline std::vector<RegressionFold<FeatureType>> leave_one_group_out(
259+
const RegressionDataset<FeatureType> &dataset,
260+
const std::function<FoldName(const FeatureType &)> &get_group_name) {
261+
const FoldIndexer indexer =
262+
leave_one_group_out_indexer<FeatureType>(dataset, get_group_name);
263+
return folds_from_fold_indexer<FeatureType>(dataset, indexer);
264+
}
265+
93266
} // namespace albatross
94267

95268
#endif

0 commit comments

Comments
 (0)