diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..94242df7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+data
+data/*
+plots/*
+wl_*.npz
+*.pkl
diff --git a/ComplexSOM.yaml b/ComplexSOM.yaml
new file mode 100644
index 00000000..5b76fa08
--- /dev/null
+++ b/ComplexSOM.yaml
@@ -0,0 +1,57 @@
+metrics: [SNR_3x2]
+bands: riz
+training_file: data_buzzard/training.hdf5
+validation_file: data_buzzard/validation.hdf5
+output_file: example/ComplexSOM_output.txt
+# Backend implementing the metrics, either: "firecrown" (default), "jax-cosmo"
+metrics_impl: firecrown
+
+run:
+  # This is a class name which will be looked up
+  ComplexSOM:
+    run_5:
+      # These settings are sent to the classifier
+      testing: False
+      bins: 5
+      som_dim: [51,51]
+      num_groups: 200
+      num_threads: 32
+      group_type: 'redshift'
+      whiten: False
+      redshift_propset: 2
+      data_threshold: [0,30]
+      sparse_frac: 1.0
+      plots: True
+      plot_dir: 'plots'
+      metric: "SNR_3x2"
+      use_inbin: False
+      use_outbin: False
+      # These special settings decide whether the
+      # color, error, and size columns are passed to the classifier
+      # as well as the magnitudes
+      colors: False
+      errors: False
+      sizes: True
+    run_10:
+      # These settings are sent to the classifier
+      testing: False
+      bins: 10
+      som_dim: [51,51]
+      num_groups: 200
+      num_threads: 32
+      group_type: 'redshift'
+      redshift_propset: 2
+      data_threshold: [0,30]
+      sparse_frac: 1.0
+      plots: True
+      plot_dir: 'plots'
+      metric: "SNR_3x2"
+      use_inbin: False
+      use_outbin: False
+      # These special settings decide whether the
+      # color, error, and size columns are passed to the classifier
+      # as well as the magnitudes
+      colors: False
+      errors: False
+      sizes: True
+
diff --git a/bin/challenge.py b/bin/challenge.py
index 5ec60273..dcabe9e4 100755
--- a/bin/challenge.py
+++ b/bin/challenge.py
@@ -27,12 +27,15 @@ def main(config_yaml):
     # Decide if anyone needs the colors calculating and/or errors loading
     anyone_wants_colors = False
     anyone_wants_errors = False
+    anyone_wants_sizes = False
     for run in config['run'].values():
         for version in run.values():
             if version.get('errors'):
                 anyone_wants_errors = True
             if version.get('colors'):
                 anyone_wants_colors = True
+            if version.get('sizes'):
+                anyone_wants_sizes = True

     bands = config['bands']
@@ -41,14 +44,16 @@
     training_data = tc.load_data(
         config['training_file'],
         bands,
         errors=anyone_wants_errors,
-        colors=anyone_wants_colors
+        colors=anyone_wants_colors,
+        size=anyone_wants_sizes
     )

     validation_data = tc.load_data(
         config['validation_file'],
         bands,
         errors=anyone_wants_errors,
-        colors=anyone_wants_colors
+        colors=anyone_wants_colors,
+        size=anyone_wants_sizes
     )

     training_z = tc.load_redshift(config['training_file'])
@@ -59,7 +64,7 @@
     else:
         metrics_fn = tc.compute_scores

-    with open(config['output_file'],'w') as output_file:
+    with open(config['output_file'],'a') as output_file:
         for classifier_name, runs in config['run'].items():
             for run, settings in runs.items():
                 scores = run_one(classifier_name, bands, settings,
@@ -77,15 +82,16 @@ def run_one(classifier_name, bands, settings, train_data, train_z, valid_data,
     if classifier.wants_arrays:
         errors = settings.get('errors')
         colors = settings.get('colors')
-        train_data = tc.dict_to_array(train_data, bands, errors=errors, colors=colors)
-        valid_data = tc.dict_to_array(valid_data, bands, errors=errors, colors=colors)
+        sizes = settings.get('sizes')
+        train_data = tc.dict_to_array(train_data, bands, errors=errors, colors=colors, sizes=sizes)
+        valid_data = tc.dict_to_array(valid_data, bands, errors=errors, colors=colors, sizes=sizes)

     print ("Executing: ", classifier_name, bands, settings)

     ## first check if options are valid
     print (settings, classifier.valid_options)
     for key in settings.keys():
-        if key not in classifier.valid_options and key not in ['errors', 'colors']:
+        if key not in classifier.valid_options and key not in ['errors', 'colors', 'sizes']:
             raise ValueError(f"Key {key} is not recognized by classifier {classifier_name}")

     print ("Initializing classifier...")
@@ -102,7 +108,7 @@ def run_one(classifier_name, bands, settings, train_data, train_z, valid_data,

     print ("Making some pretty plots...")
     name = str(classifier.__name__)
-    tc.metrics.plot_distributions(valid_z, results, f"plots/{name}_{settings}_{bands}.png")
+    tc.metrics.plot_distributions(valid_z, results, f"plots/{name}_{bands}.png")

     return scores
diff --git a/example/SimpleSOM.yaml b/example/SimpleSOM.yaml
new file mode 100644
index 00000000..6f261f85
--- /dev/null
+++ b/example/SimpleSOM.yaml
@@ -0,0 +1,28 @@
+metrics: [SNR_3x2]
+bands: riz
+training_file: data/training.hdf5
+validation_file: data/validation_small.hdf5
+output_file: example/example_output.txt
+# Backend implementing the metrics, either: "firecrown" (default), "jax-cosmo"
+metrics_impl: firecrown
+
+run:
+  # This is a class name which will be looked up
+  SimpleSOM:
+    run_3:
+      # These settings are sent to the classifier
+      bins: 10
+      som_dim: [151,151]
+      num_groups: 200
+      num_threads: 32
+      group_type: 'redshift'
+      data_threshold: [0,30]
+      sparse_frac: 0.05
+      plots: True
+      plot_dir: 'plots'
+      # These special settings decide whether the
+      # color and error columns are passed to the classifier
+      # as well as the magnitudes
+      colors: False
+      errors: False
+
diff --git a/the_atlantics/example.py b/the_atlantics/example.py
new file mode 100644
index 00000000..76a8a8fd
--- /dev/null
+++ b/the_atlantics/example.py
@@ -0,0 +1,74 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from snc import SnCalc
+
+
+# Let's start with a "large" set of 12 samples
+n_bins = 12
+z_cent = np.linspace(0.2, 1.4, n_bins+1)+0.5/n_bins
+sigma_z = 0.05 * (1+z_cent)
+z_arr = np.linspace(0, 2, 1024)
+# Total number of galaxies in each sample
+n_gals = (z_cent/0.5)**2*np.exp(-(z_cent/0.5)**1.5) * 10. * 1.8E4 * 60**2
+
+
+def pz_gau(z, zc, sz):
+    return np.exp(-0.5*((z-zc)/sz)**2)/np.sqrt(2*np.pi*sz**2)
+
+
+# Create the N(z) arrays, which should integrate to the total number
+# of galaxies in each sample
+nzs = [ng * pz_gau(z_arr, zc, sz)
+       for ng, zc, sz in zip(n_gals, z_cent, sigma_z)]
+
+# Let's plot them
+plt.figure()
+for nz in nzs:
+    plt.plot(z_arr, nz)
+plt.xlabel(r'$z$', fontsize=16)
+plt.ylabel(r'$N(z)$', fontsize=16)
+
+
+c = SnCalc(z_arr, nzs)
+# Now let's do a dumb thing and bunch them up into
+# 4, 3, 2 and 1 subsamples
+assign4 = [np.array([0, 1, 2]),
+           np.array([3, 4, 5]),
+           np.array([6, 7, 8]),
+           np.array([9, 10, 11])]
+assign3 = [np.array([0, 1, 2, 3]),
+           np.array([4, 5, 6, 7]),
+           np.array([8, 9, 10, 11])]
+assign2 = [np.array([0, 1, 2, 3, 4, 5]),
+           np.array([6, 7, 8, 9, 10, 11])]
+assign1 = [np.arange(n_bins)]
+
+
+# OK, calculate S/N for all of these groupings
+snr_d4 = c.get_sn(assign4, full_output=True)
+print('4 bins, S/N = %.1lf' % (snr_d4['snr']))
+snr_d3 = c.get_sn(assign3, full_output=True)
+print('3 bins, S/N = %.1lf' % (snr_d3['snr']))
+snr_d2 = c.get_sn(assign2, full_output=True)
+print('2 bins, S/N = %.1lf' % (snr_d2['snr']))
+snr_d1 = c.get_sn(assign1, full_output=True)
+print('1 bin, S/N = %.1lf' % (snr_d1['snr']))
+
+
+# Note that the calculator also returns the power
+# spectra, if you wanna look at them.
+plt.figure()
+for i in range(4):
+    for j in range(i, 4):
+        if i == j:
+            plt.plot(snr_d4['ls'],
+                     snr_d4['cl'][:, i, j],
+                     'k-')
+            plt.plot(snr_d4['ls'],
+                     snr_d4['nl'][0, i, j]*np.ones_like(snr_d4['ls']), 'k--')
+        else:
+            plt.plot(snr_d4['ls'],
+                     snr_d4['cl'][:, i, j],
+                     '-', c='#AAAAAA')
+plt.loglog()
+plt.show()
diff --git a/the_atlantics/example_minimizer.py b/the_atlantics/example_minimizer.py
new file mode 100644
index 00000000..8e168c7c
--- /dev/null
+++ b/the_atlantics/example_minimizer.py
@@ -0,0 +1,86 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from snc import SnCalc
+from scipy.optimize import minimize
+
+
+# Let's start with a "large" set of 100 samples
+n_bins = 100
+z_cent = np.linspace(0.2, 1.4, n_bins+1)+0.5/n_bins
+sigma_z = 0.05 * (1+z_cent)
+z_arr = np.linspace(0, 2, 1024)
+# Total number of galaxies in each sample
+n_gals = (z_cent/0.5)**2*np.exp(-(z_cent/0.5)**1.5) * 10. * 1.8E4 * 60**2
+
+
+def pz_gau(z, zc, sz):
+    return np.exp(-0.5*((z-zc)/sz)**2)/np.sqrt(2*np.pi*sz**2)
+
+
+# Create the N(z) arrays, which should integrate to the total number
+# of galaxies in each sample.
+nzs = [ng * pz_gau(z_arr, zc, sz)
+       for ng, zc, sz in zip(n_gals, z_cent, sigma_z)]
+
+# Let's plot them
+plt.figure()
+for nz in nzs:
+    plt.plot(z_arr, nz)
+plt.xlabel(r'$z$', fontsize=16)
+plt.ylabel(r'$N(z)$', fontsize=16)
+
+# Now initialize a S/N calculator for these initial groups.
+c_wl = SnCalc(z_arr, nzs, use_clustering=False)
+c_gc = SnCalc(z_arr, nzs, use_clustering=True)
+print("Initializing WL")
+c_wl.get_cl_matrix(fname_save='wl_nb%d.npz' % n_bins)
+print("Initializing GC")
+c_gc.get_cl_matrix(fname_save='gc_nb%d.npz' % n_bins)
+
+# Let's focus on WL for now.
+# OK, let's brute-force explore the total S/N as a function
+# of the bin edge for a 2-bin scenario.
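+# (Orientation note, inferred from `assign_from_edges` in snc.py: the calls
+# below pass only the interior bin edges, so a single edge value z splits
+# the sample into two bins, below and above z.)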
+print("Exploring 2-bins") +zs = np.linspace(0.01, 1.5, 128) +sns_wl = np.array([c_wl.get_sn_from_edges(np.array([z])) for z in zs]) +sns_gc = np.array([c_gc.get_sn_from_edges(np.array([z])) for z in zs]) +plt.figure() +plt.plot(zs, sns_wl/np.amax(sns_wl), 'r-', label='Lensing') +plt.plot(zs, sns_gc/np.amax(sns_gc), 'k-', label='Clustering') +plt.xlabel('$z$') +plt.ylabel(r'$(S/N)/(S/N)_{\rm max}$') +plt.legend(loc='upper left', frameon=False) + +# Now let's do a 3-bin case (more coarsely-grained so we can do it quickly) +print("Exploring 3-bins") +zs = np.linspace(0.01, 1.5, 32) +sns = np.array([[c_wl.get_sn_from_edges(np.array([z1, z2])) for z1 in zs] + for z2 in zs]) + +plt.figure() +plt.imshow(sns, vmin=1440, extent=[0.01, 1.5, 0.01, 1.5], + origin='lower') +plt.xlabel('$z_1$') +plt.ylabel('$z_2$') +cb = plt.colorbar() +cb.set_label('$S/N$') + + +# Finally, let's write the function to minimize and optimize for a 4-bin case. +def minus_sn(edges, calc): + return -calc.get_sn_from_edges(edges) + + +print("Optimizing WL") +edges_0 = np.array([0.3, 0.6, 0.9]) +res = minimize(minus_sn, edges_0, method='Powell', args=(c_wl,)) +print("WL final edges: ", res.x) +print("Maximum S/N: ", c_wl.get_sn_from_edges(res.x)) +print(" ") + +# That was for weak lensing. Let's do the same thing for clustering. +print("Optimizing GC") +res = minimize(minus_sn, edges_0, method='Powell', args=(c_gc,)) +print("GC final edges: ", res.x) +print("Maximum S/N: ", c_gc.get_sn_from_edges(res.x)) +plt.show() diff --git a/the_atlantics/optimizer.py b/the_atlantics/optimizer.py new file mode 100644 index 00000000..f46dd508 --- /dev/null +++ b/the_atlantics/optimizer.py @@ -0,0 +1,153 @@ +from argparse import ArgumentParser +import numpy as np +from snc import SnCalc, assign_params_default +from scipy.interpolate import interp1d +from scipy.optimize import minimize, brentq +from scipy.integrate import simps + + +parser = ArgumentParser() +parser.add_argument("--nz-file", default="nzs.npy", type=str, + help="Path to input file containing the N(z) of all initial groups") +parser.add_argument("--prefix-out", default='out', type=str, + help="Prefix for all output files") +parser.add_argument("--suffix-out", default='', type=str, + help="Suffix for all output files") +parser.add_argument("--n-bins", default=4, type=int, + help="Number of bins (excluding a possible trash bin") +parser.add_argument("--prob-in-bin", default=-1., type=float, + help="Minimum fraction of the N(z) that should be inside the bin." + "If <= 0, this will not be taken into account in the optimization.") +parser.add_argument("--prob-out-bin", default=-1., type=float, + help="Maximum fraction of the N(z) that should be inside other bins." 
+ "If <= 0, this will not be taken into account in the optimization.") +parser.add_argument('--in-bin-free', default=False, action='store_true', + help='Free up in-bin probability (default: False)') +parser.add_argument('--use-clustering', default=False, action='store_true', + help='Use clustering (default: False)') +parser.add_argument('--use-3x2', default=False, action='store_true', + help='Use 3x2 (default: False)') +parser.add_argument('--random-start', default=False, action='store_true', + help='Use random starting points (default: False)') +parser.add_argument('--random-seed', default=1234, type=int, + help='Random seed') +o = parser.parse_args() + +nzs = np.load(o.nz_file) +# We should probably read z on input too +z_arr = np.linspace(0, 2, 124) +zz = (z_arr[1:]+z_arr[:-1])*0.5 +# Renormalize to use the same sample definitions as the +# official metric calculators. +fsky = 0.25 +ndens_arcmin = 20. +area_arcmin = (180*60/np.pi)**2*4*np.pi*fsky +ng_tot = ndens_arcmin * area_arcmin +ng_each_small = simps(nzs, x=zz) +ng_tot_small = np.sum(ng_each_small) +ng_each = ng_each_small * ng_tot / ng_tot_small +nzs = nzs * ng_tot / ng_tot_small + +# Initialize calculator +sc = SnCalc(zz, nzs, fsky=fsky, use_clustering=o.use_clustering, use_3x2=o.use_3x2) +if o.use_3x2: + suffix = 'cl_3x2.npz' +else: + if o.use_clustering: + suffix = 'cl_gc.npz' + else: + suffix = 'cl_wl.npz' +sc.get_cl_matrix(fname_save=o.prefix_out + suffix) + +# Initial edge guess (defined as having equal number of galaxies) +if o.random_start: + np.random.seed(o.random_seed) + z_end = 1.5 + delta_z = z_end / (o.n_bins-1) + edges_0 = delta_z * (np.arange(o.n_bins-1) + np.random.rand(o.n_bins-1)) +else: + nz_tot = np.sum(nzs, axis=0) + cumulative_fraction = np.cumsum(nz_tot) / np.sum(nz_tot) + cumul_f = interp1d(zz, cumulative_fraction, bounds_error=False, + fill_value=(0, 1)) + edges_0 = np.array([brentq(lambda z : cumul_f(z) - q, 0, 2) + for q in (np.arange(o.n_bins-1)+1.)/o.n_bins]) + +if o.in_bin_free: + if o.random_start: + pin = np.random.rand() + else: + pin = 0.5 + edges_0 = np.concatenate((np.array([pin]), edges_0)) +print(edges_0) + +# Minimize +# Binning parameters + +params = assign_params_default.copy() +if o.prob_in_bin > 0: + params['p_inbin_thr'] = o.prob_in_bin + params['use_p_inbin'] = True +if o.prob_out_bin > 0: + params['p_outbin_thr'] = o.prob_out_bin + params['use_p_outbin'] = True +if o.in_bin_free: + params['use_p_inbin'] = True + params['p_inbin_thr'] = edges_0[0] + + +def minus_sn(edges, calc): + if o.in_bin_free: + if (edges[0] < 0) or (edges[0] > 1): + return 0. 
+ params['p_inbin_thr'] = edges[0] + edges_use = edges[1:] + else: + edges_use = edges + return -calc.get_sn_from_edges(edges_use, assign_params=params) + + +# Actual optimization +res = minimize(minus_sn, edges_0, method='Powell', args=(sc,)) +if o.in_bin_free: + edges_1 = res.x[1:] + params['p_inbin_thr'] = res.x[0] +else: + edges_1 = res.x + +# Post-process: +# S/N +sn = sc.get_sn_from_edges(edges_1, assign_params=params) +# N(z)s of the final bins +nz_best = sc.get_nzs_from_edges(edges_1, assign_params=params) +# Group assignment +assign = sc.assign_from_edges(edges_1, assign_params=params, + get_ids=True) + + +def get_bin_name(i): + if i == -1: + return 'bin_trash' + else: + return 'bin_%d' % i + +print("Redshift edges: ", edges_1) +print("Optimal S/N: ", sn) +n_tot = simps(np.sum(nz_best, axis=0), + x=zz) + +d_out = {'sn': sn, + 'edges': edges_1} +if o.in_bin_free: + d_out['p_in'] = params['p_inbin_thr'] + print("P_in: ", params['p_inbin_thr']) +for (i, a), n in zip(assign, nz_best): + name = get_bin_name(i) + d_out[name+'_groups'] = a + d_out[name+'_nz'] = n + n_here = simps(n, x=zz) + f = n_here / n_tot + frac = "%.1lf" % (100 * f) + print("Bin %d, groups: " % i, a, frac) + +np.savez(o.prefix_out + o.suffix_out + '_bin_info.npz', **d_out) diff --git a/the_atlantics/snc.py b/the_atlantics/snc.py new file mode 100644 index 00000000..4c1d537e --- /dev/null +++ b/the_atlantics/snc.py @@ -0,0 +1,370 @@ +import numpy as np +import pyccl as ccl +from scipy.integrate import simps +import os + + +assign_params_default = {'p_inbin_thr': 0.5, + 'p_outbin_thr': 0.2, + 'use_p_inbin': False, + 'use_p_outbin': False} + + +class SnCalc(object): + edges_large = 3. + + def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, n_ell=100, + s_gamma=0.26, use_clustering=False, use_3x2=False, + integrator='qag_quad'):#spline'): + """ S/N calculator + Args: + z_arr (array_like): array of redshifts at which all the N(z)s are + sampled. They should be linearly spaced. + nz_list (list): list of arrays containing the redshift + distributions of all the initial groups (each array in the + list corresponds to one group). The arrays should contain the + redshift distribution sampled at `z_arr`, with its integral + over `z_arr` equal to the number of galaxies in that group. + fsky (float): sky fraction (for SNR calculation and for + transforming number of galaxies into number densities). + lmax (float): maximum ell. + d_ell (float): ell bandpower width. + s_gamma (float): rms ellipticity scatter (relevant if + `use_clustering=False`). + use_clustering (bool): if `True`, SNR will be computed for + clustering instead of lensing. + use_3x2 (bool): if `True`, SNR will be computed for 3x2pt. + integrator (string): CCL integration method. Either 'qag_quad' + or 'spline'. 
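+
+        Example (an illustrative sketch, not taken from the original code;
+        `z_arr` and `nzs` as built in example.py):
+            >>> calc = SnCalc(z_arr, nzs, fsky=0.4)
+            >>> snr = calc.get_sn([np.arange(len(nzs))])  # single-bin S/N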
+        """
+        self.integrator = integrator
+        self.s_gamma = s_gamma
+        self.fsky = fsky
+        self.lmax = lmax
+        self.n_ell = n_ell
+        ell_edges = np.logspace(2, np.log10(lmax), n_ell+1)
+        self.larr = 0.5*(ell_edges[1:]+ell_edges[:-1])
+        self.d_ell = (ell_edges[1:]-ell_edges[:-1])
+        self.n_samples = len(nz_list)
+        self.z_arr = z_arr
+        # Store as an array so that row indexing (e.g. in
+        # `get_nzs_from_edges`) also works when a plain list is passed in.
+        self.nz_list = np.asarray(nz_list)
+        self.n_gals = np.array([simps(nz, x=self.z_arr)
+                                for nz in self.nz_list])
+        self.pz_list = self.nz_list / self.n_gals[:, None]
+        self.z_means = np.array([simps(self.z_arr*nz, x=self.z_arr)
+                                 for nz in self.nz_list]) / self.n_gals
+        self.n_dens = self.n_gals / (4*np.pi*self.fsky)
+        self.cls = None
+        self.use_clustering = use_clustering
+        self.use_3x2 = use_3x2
+        if use_3x2:
+            self.n_tracers = 2*self.n_samples
+        else:
+            self.n_tracers = self.n_samples
+
+    def _bz_model(self, cosmo, z):
+        return 1./ccl.growth_factor(cosmo, 1./(1+z))
+
+    def get_cl_matrix(self, fname_save=None, recompute=False):
+        """ Computes matrix of power spectra between all the initial groups.
+        Args:
+            fname_save (string): if not None, the result will be saved to a
+                file by this name, and if present and `recompute=False`, the
+                matrix will be read from that file.
+            recompute (bool): if `True`, any previous calculations of the
+                C_ell matrix stored on file will be ignored, and the matrix
+                will be recomputed.
+        """
+        if fname_save is not None:
+            if os.path.isfile(fname_save) and not recompute:
+                self.cls = np.load(fname_save)['cls']
+                return
+
+        # Cosmology
+        cosmo = ccl.Cosmology(Omega_c=0.25,
+                              Omega_b=0.05,
+                              h=0.67, n_s=0.96,
+                              sigma8=0.81)
+
+        # Tracers
+        bz = self._bz_model(cosmo, self.z_arr)
+        if self.use_3x2:
+            trs_gc = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz),
+                                             bias=(self.z_arr, bz))
+                      for nz in self.nz_list]
+            trs_wl = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz))
+                      for nz in self.nz_list]
+            trs = trs_gc + trs_wl
+        else:
+            if self.use_clustering:
+                trs = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz),
+                                              bias=(self.z_arr, bz))
+                       for nz in self.nz_list]
+            else:
+                trs = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz))
+                       for nz in self.nz_list]
+
+        # Cls
+        self.cls = np.zeros([self.n_ell, self.n_tracers, self.n_tracers])
+        for i in range(self.n_tracers):
+            for j in range(i, self.n_tracers):
+                try:
+                    cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr,
+                                        limber_integration_method=self.integrator)
+                except Exception:
+                    cl = np.zeros(len(self.larr))
+                self.cls[:, i, j] = cl
+                if j != i:
+                    self.cls[:, j, i] = cl
+
+        if fname_save is not None:
+            np.savez(fname_save, cls=self.cls)
+
+    def get_resample_metadata(self, assign):
+        """ Transform an assignment list into a weight matrix and the
+        corresponding number densities.
+        Args:
+            assign (list): list of arrays. Each element of the list should
+                be a numpy array of integers, corresponding to the indices
+                of the initial groups that are now resampled into a larger
+                group.
+        """
+        n_resamples = len(assign)
+        weight_res = np.zeros([n_resamples, self.n_samples])
+        n_dens_res = np.zeros(n_resamples)
+        for ia, a in enumerate(assign):
+            ndens = self.n_dens[a]
+            ndens_tot = np.sum(ndens)
+            n_dens_res[ia] = ndens_tot
+            weight_res[ia, a] = ndens / ndens_tot
+        return weight_res, n_dens_res
+
+    def _get_cl_resamples(self, weights):
+        """ Gets C_ell matrix for resampled groups from weights matrix.
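+        Equivalent, for each ell, to the contraction C'_ell = W C_ell W^T
+        (i.e. np.einsum('ai,lij,bj->lab', weights, cls, weights)).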
+        """
+        if self.cls is None:
+            self.get_cl_matrix()
+        if self.use_3x2:
+            w_use = np.tile(weights, (2, 2))
+        else:
+            w_use = weights
+        return np.sum(np.sum(w_use[None, None, :, :] *
+                             self.cls[:, :, None, :],
+                             axis=-1)[:, None, :, :] *
+                      w_use[None, :, :, None],
+                      axis=-2)

+    def _get_nl_resamples(self, n_dens):
+        """ Gets noise contribution to C_ell matrix for resampled groups
+        from list of number densities.
+        """
+        if self.use_3x2:
+            n_diag = np.array(list(1./n_dens) +
+                              list(self.s_gamma**2/n_dens))
+            return np.diag(n_diag)[None, :, :]
+        else:
+            if self.use_clustering:
+                return np.diag(1./n_dens)[None, :, :]
+            else:
+                return np.diag(self.s_gamma**2/n_dens)[None, :, :]
+
+    def get_sn_max(self, full_output=False):
+        """ Compute maximum signal-to-noise ratio given the input
+        groups
+        Args:
+            full_output (bool): if true, a dictionary with additional
+                information will be returned. Otherwise just total S/N.
+        Returns:
+            If `full_output=True`, dictionary containing S/N, power
+            spectra and noise spectra. Otherwise just S/N.
+        """
+        weights = np.eye(self.n_samples)
+        n_dens = self.n_dens
+        return self.get_sn_wn(weights, n_dens, full_output=full_output)
+
+    def get_kltrans(self):
+        """ Compute KL decomposition.
+
+        Returns:
+            Fisher matrix element for each of the KL modes.
+        """
+        nl = self._get_nl_resamples(self.n_dens)
+        nij = nl*np.ones(self.n_ell)[:, None, None]
+        cij = self.cls + nij
+        sij = cij - nij
+        inv_nij = np.linalg.inv(nij)
+        metric = inv_nij
+
+        def change_basis(c, m, ev):
+            return np.array([np.diag(np.dot(ev[i].T,
+                                            np.dot(m[i],
+                                                   np.dot(c[i],
+                                                          np.dot(m[i],
+                                                                 ev[i])))))
+                             for i in range(self.n_ell)])
+
+        def diagonalize(c, m):
+            im = np.linalg.inv(m)
+            ll = np.linalg.cholesky(m)
+            ill = np.linalg.cholesky(im)
+            cl = np.array([np.dot(np.transpose(ll[i]),
+                                  np.dot(c[i], ll[i]))
+                           for i in range(self.n_ell)])
+            c_p, v = np.linalg.eigh(cl)
+            ev = np.array([np.dot(np.transpose(ill[i]), v[i])
+                           for i in range(self.n_ell)])
+            # iden = change_basis(im, m, ev)
+            return ev, c_p
+
+        e_v, c_p = diagonalize(cij, metric)
+        s_p = change_basis(sij, metric, e_v)
+        nmodes_l = (self.fsky * self.d_ell * (self.larr+0.5))
+        fish_kl = nmodes_l[:, None]*(s_p/c_p)**2
+        isort = np.argsort(-np.sum(fish_kl, axis=0))
+        e_o = e_v[:, :, isort]
+        # f_o = np.array([np.dot(inv_nij[l], e_o[l, :, :])
+        #                 for l in range(self.n_ell)])
+        c_p = change_basis(cij, metric, e_o)
+        s_p = change_basis(sij, metric, e_o)
+
+        fish_kl = nmodes_l[:, None]*(s_p/c_p)**2
+        fish_permode = np.sum(fish_kl, axis=0)
+        fish_cumul = np.cumsum(fish_permode)
+        return fish_kl, fish_permode, fish_cumul
+
+    def get_sn_wn(self, weights, n_dens, full_output=False):
+        """ Compute signal-to-noise ratio from weights matrix and number
+        densities of the resampled groups.
+        Args:
+            weights (array_like): weights matrix of shape
+                `(N_resample, N_initial)`, where `N_resample` is the
+                number of new groups, and `N_initial` is the original
+                number of groups. Each entry corresponds to the weight
+                with which a given initial group enters the new set of
+                groups.
+            n_dens (array_like): number density (in sterad^-1) of the
+                new groups.
+            full_output (bool): if true, a dictionary with additional
+                information will be returned. Otherwise just total S/N.
+        Returns:
+            If `full_output=True`, dictionary containing S/N, power
+            spectra and noise spectra. Otherwise just S/N.
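+
+        Example (illustrative): merging four equal-density groups into two
+        pairs corresponds to weights=[[0.5, 0.5, 0, 0], [0, 0, 0.5, 0.5]],
+        with n_dens the summed densities of each pair (see
+        `get_resample_metadata`).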
+        """
+        sl = self._get_cl_resamples(weights)
+        nl = self._get_nl_resamples(n_dens)
+        cl = sl + nl
+        icl = np.linalg.inv(cl)
+        sn2_1pt = np.sum(sl[:, :, :, None] * icl[:, None, :, :], axis=2)
+        sn2_ell = np.sum(sn2_1pt[:, :, :, None] * sn2_1pt[:, None, :, :],
+                         axis=2)
+        trsn2_ell = np.trace(sn2_ell, axis1=1, axis2=2)
+        snr = np.sqrt(np.sum(trsn2_ell * (self.larr + 0.5) *
+                             self.d_ell * self.fsky))
+
+        if full_output:
+            dret = {'snr': snr, 'cl': sl, 'nl': nl, 'ls': self.larr}
+            return dret
+        else:
+            return snr
+
+    def get_sn(self, assign, full_output=False):
+        """ Compute signal-to-noise ratio from a bin assignment list.
+        Args:
+            assign (list): list of arrays. Each element of the list should
+                be a numpy array of integers, corresponding to the indices
+                of the initial groups that are now resampled into a larger
+                group.
+            full_output (bool): if true, a dictionary with additional
+                information will be returned. Otherwise just total S/N.
+        Returns:
+            If `full_output=True`, dictionary containing S/N, power
+            spectra and noise spectra. Otherwise just S/N.
+        """
+        w, ndens = self.get_resample_metadata(assign)
+        return self.get_sn_wn(w, ndens,
+                              full_output=full_output)
+
+    def check_edges(self, edges):
+        """ Returns `True` if there's something wrong with the edges.
+        """
+        swapped = False
+        if np.ndim(edges) > 0:
+            swapped = np.any(np.diff(edges) < 0)
+        return np.any(edges < 0) or \
+            np.any(edges > self.edges_large) or \
+            swapped
+
+    def get_nzs_from_edges(self, edges, assign_params=assign_params_default):
+        assign = self.assign_from_edges(edges, assign_params=assign_params)
+        nzs = np.array([np.sum(self.nz_list[a, :], axis=0)
+                        for a in assign])
+        return nzs
+
+    def assign_from_edges(self, edges, assign_params=assign_params_default,
+                          get_ids=False):
+        """ Get assignment list from edges and assign parameters.
+        Args:
+            edges (array_like): array of bin edges.
+            assign_params (dict): dictionary of assignment parameters
+                (see `assign_params_default`).
+            get_ids (bool): if `True`, output assignment arrays will
+                be accompanied by the associated bin id (with -1 for the
+                trash bin).
+        Returns:
+            List of assignment arrays ready to be used in e.g. `get_sn`.
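+
+        Example (illustrative; `calc` is a hypothetical instance): with
+        `edges=np.array([0.5, 1.0])` and the default parameters, each
+        initial group is placed in bin 0, 1 or 2 by its mean redshift:
+            >>> assign = calc.assign_from_edges(np.array([0.5, 1.0]))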
+        """
+        edges = np.atleast_1d(edges)
+        nbins = len(edges) + 1
+        # Bin IDs based on mean z
+        ids = np.digitize(self.z_means, bins=edges)
+        if assign_params['use_p_inbin'] or assign_params['use_p_outbin']:
+            def integrate_safe(p, z, m):
+                if np.sum(m) == 0:
+                    return 0
+                else:
+                    return simps(p[m], x=z[m])
+            # Calculate probabilities in each bin
+            zgroups = np.digitize(self.z_arr, bins=edges)
+            masks = [zgroups == i for i in range(nbins)]
+            if assign_params['use_p_outbin']:
+                # Matrix of probabilities in each bin
+                pzs = np.array([[integrate_safe(pz, self.z_arr, m)
+                                 for m in masks]
+                                for pz in self.pz_list])
+                pzd = np.array([pzs[j, ids[j]]
+                                for j in range(self.n_samples)])
+            else:
+                # Overlaps in own bin
+                pzd = np.array([integrate_safe(pz, self.z_arr, masks[i])
+                                for pz, i in zip(self.pz_list, ids)])
+            # Throw away based on in-bin probability
+            if assign_params['use_p_inbin']:
+                ids[pzd < assign_params['p_inbin_thr']] = -1
+            # Throw away based on off-bin probability
+            if assign_params['use_p_outbin']:
+                ids[np.array([np.sum(p > assign_params['p_outbin_thr']) > 1
+                              for p in pzs])] = -1
+        # Assignment list
+        if get_ids:
+            return [(i, np.where(ids == i)[0]) for i in np.unique(ids)]
+        else:
+            return [np.where(ids == i)[0] for i in np.unique(ids)]
+
+    def get_sn_from_edges(self, edges, full_output=False,
+                          assign_params=assign_params_default):
+        """ Compute signal-to-noise ratio from a set of edges and assignment
+        parameters.
+        Args:
+            edges (array_like): array of bin edges.
+            assign_params (dict): dictionary of assignment parameters
+                (see `assign_params_default`).
+            full_output (bool): if true, a dictionary with additional
+                information will be returned. Otherwise just total S/N.
+        Returns:
+            If `full_output=True`, dictionary containing S/N, power
+            spectra and noise spectra. Otherwise just S/N.
+        """
+        if self.check_edges(edges):
+            return 0
+        assign = self.assign_from_edges(edges, assign_params=assign_params)
+        return self.get_sn(assign, full_output=full_output)
diff --git a/tomo_challenge/classifiers/ComplexSOM.py b/tomo_challenge/classifiers/ComplexSOM.py
new file mode 100644
index 00000000..f43cdebf
--- /dev/null
+++ b/tomo_challenge/classifiers/ComplexSOM.py
@@ -0,0 +1,470 @@
+"""
+This is an example tomographic bin generator using a Self-Organising Map (SOM).
+
+Every classifier module needs to:
+ - have construction of the type
+       __init__ (self, bands, options) (see examples below)
+ - implement two functions:
+       train (self, training_data, training_z)
+       apply (self, data).
+ - define valid_options class variable.
+
+See Classifier Documentation below.
+"""
+
+import os
+from .base import Tomographer
+import numpy as np
+import rpy2
+import rpy2.robjects as ro
+import rpy2.robjects.packages as rpack
+from rpy2.robjects.vectors import StrVector, IntVector, DataFrame, FloatVector, FactorVector
+import pandas as pd
+from rpy2.robjects import pandas2ri
+from rpy2.robjects.conversion import localconverter
+from ..snc import SnCalc
+from scipy.interpolate import interp1d
+from scipy.optimize import minimize, brentq
+from scipy.integrate import simps
+import pickle
+
+#Check that all the needed packages are installed
+# R package names
+packnames = ('data.table','itertools','foreach','doParallel','RColorBrewer','devtools','matrixStats')
+base=ro.packages.importr("base")
+utils=ro.packages.importr("utils")
+stats=ro.packages.importr("stats")
+gr=ro.packages.importr("graphics")
+dev=ro.packages.importr("grDevices")
+utils.chooseCRANmirror(ind=1)
+# Selectively install what needs to be installed.
+names_to_install = [x for x in packnames if not rpack.isinstalled(x)]
+if len(names_to_install) > 0:
+    utils.install_packages(StrVector(names_to_install))
+base.Sys_setenv(TAR=base.system("which tar",intern=True))
+devtools=ro.packages.importr("devtools")
+#devtools.install_github("AngusWright/helpRfuncs")
+#devtools.install_github("AngusWright/kohonen/kohonen")
+kohonen=ro.packages.importr("kohonen")
+
+class ComplexSOM(Tomographer):
+    """ Complex SOM Classifier with SNR Optimisation """
+
+    # valid parameter -- see below
+    valid_options = ['bins','som_dim','num_groups','num_threads',
+                     'group_type','data_threshold','sparse_frac','plots',
+                     'plot_dir','metric','use_inbin','use_outbin','testing',
+                     'whiten','redshift_propset']
+    # this setting means dictionaries (not arrays) will be sent to
+    # train and apply
+    wants_arrays = False
+
+    def __init__ (self, bands, options):
+        """Constructor
+
+        Parameters:
+        -----------
+        bands: str
+          string containing valid bands, like 'riz' or 'griz'
+        options: dict
+          options come through here. Valid keys are listed as valid_options
+          class variable.
+
+        Note:
+        -----
+        Valid options are:
+            'bins' - number of tomographic bins
+            'som_dim' - dimensions of the SOM
+            'num_groups' - number of SOM groups/associations
+            'num_threads' - number of threads to use
+            'sparse_frac' - the sparse-sampling fraction used for training
+            'group_type' - do we want grouping by colour or redshift
+            'data_threshold' - [min,max] values used to threshold the
+                training data
+
+        """
+        self.bands = bands
+        self.opt = options
+
+    def train (self, training_data, training_z):
+        """Trains the SOM and outputs the resulting bins
+
+        Parameters:
+        -----------
+        training_data: numpy array, size Ngalaxies x Nbands
+          training data, each row is a galaxy, each column is a band as per
+          band defined above
+        training_z: numpy array, size Ngalaxies
+          true redshift for the training sample
+
+        """
+
+        print("Initialising")
+        #Number of tomographic bins
+        n_bin = self.opt['bins']
+        #Dimensions of the SOM
+        som_dim = self.opt['som_dim']
+        #Number of discrete SOM groups
+        num_groups = self.opt['num_groups']
+        #Number of threads
+        num_threads = self.opt['num_threads']
+        #Sparse Frac
+        sparse_frac = self.opt['sparse_frac']
+        #Data Threshold
+        data_threshold = self.opt['data_threshold']
+        #Metric
+        metric = self.opt['metric']
+        #Flag bad bins with inbin probability
+        use_inbin = self.opt['use_inbin']
+        #Flag bad bins with outbin probability
+        use_outbin = self.opt['use_outbin']
+        #Plots
+        plots = self.opt['plots']
+        #redshift properties to use in grouping
+        redshift_propset = self.opt['redshift_propset']
+        #Whiten the redshift data when grouping?
+        whiten = self.opt.get('whiten', False)  # default False if absent from config
+        #Plot Output Directory
+        plot_dir = self.opt['plot_dir']
+        #Group Type
+        group_type = self.opt['group_type']
+        #Check that the group type is a valid choice
+        if group_type != 'colour' and group_type != 'redshift':
+            group_type = 'redshift'
+        #Define the assign_params variable, used in bin optimisation
+        assign_params = {'p_inbin_thr': 0.5,
+                         'p_outbin_thr': 0.2,
+                         'use_p_inbin': use_inbin,
+                         'use_p_outbin': use_outbin}
+
+        #Define the redshift summary statistics (used for making groups in the 'redshift' case)
+        if redshift_propset==1:
+            property_labels = StrVector(("mean_z_true","med_z_true","sd_z_true","mad_z_true","iqr_z_true"))
+            property_expressions = StrVector(("mean(data$redshift_true)","median(data$redshift_true)","sd(data$redshift_true)",
+                                              "mad(data$redshift_true)",
+                                              "diff(quantile(data$redshift_true,probs=pnorm(c(-2,2))))"))
+        elif redshift_propset==2:
+            property_labels = StrVector(("mean_z_true","med_z_true","mad_z_true"))
+            property_expressions = StrVector(("mean(data$redshift_true)","median(data$redshift_true)",
+                                              "mad(data$redshift_true)"))
+        elif redshift_propset==3:
+            property_labels = StrVector(("quan_05","quan_16","quan_84","quan_95"))
+            property_expressions = ("quantile(data$redshift_true,probs=c(5,16,84,95)/100)")
+        else:
+            property_labels = StrVector(("mean_z_true","med_z_true","sd_z_true","mad_z_true","iqr_z_true"))
+            property_expressions = StrVector(("mean(data$redshift_true)","median(data$redshift_true)","sd(data$redshift_true)",
+                                              "mad(data$redshift_true)",
+                                              "diff(quantile(data$redshift_true,probs=pnorm(c(-2,2))))"))
+
+        #Define the SOM variables
+        if self.bands == 'riz':
+            #riz bands
+            if self.opt['sizes'] == True:
+                expressions = ("r-i","r-z","i-z",
+                               "z","r-i-(i-z)","mcal_T")
+            else:
+                expressions = ("r-i","r-z","i-z",
+                               "z","r-i-(i-z)")
+        elif self.bands == 'griz':
+            #griz bands
+            if self.opt['sizes'] == True:
+                expressions = ("g-r","g-i",
+                               "g-z","r-i","r-z","i-z",
+                               "z","g-r-(r-i)",
+                               "r-i-(i-z)","mcal_T")
+            else:
+                expressions = ("g-r","g-i",
+                               "g-z","r-i","r-z","i-z",
+                               "z","g-r-(r-i)",
+                               "r-i-(i-z)")
+        elif self.bands == 'grizy':
+            #grizy bands
+            if self.opt['sizes'] == True:
+                expressions = ("g-r","g-i",
+                               "g-z","g-y","r-i","r-z","r-y","i-z","i-y",
+                               "z-y","z","g-r-(r-i)",
+                               "r-i-(i-z)","i-z-(z-y)","mcal_T")
+            else:
+                expressions = ("g-r","g-i",
+                               "g-z","g-y","r-i","r-z","r-y","i-z","i-y",
+                               "z-y","z","g-r-(r-i)",
+                               "r-i-(i-z)","i-z-(z-y)")
+        elif self.bands == 'ugriz':
+            #ugriz bands
+            if self.opt['sizes'] == True:
+                expressions = ("u-g","u-r","u-i","u-z","g-r","g-i",
+                               "g-z","r-i","r-z","i-z",
+                               "z","u-g-(g-r)","g-r-(r-i)",
+                               "r-i-(i-z)","mcal_T")
+            else:
+                expressions = ("u-g","u-r","u-i","u-z","g-r","g-i",
+                               "g-z","r-i","r-z","i-z",
+                               "z","u-g-(g-r)","g-r-(r-i)",
+                               "r-i-(i-z)")
+        elif self.bands == 'ugrizy':
+            #ugrizy bands
+            if self.opt['sizes'] == True:
+                expressions = ("u-g","u-r","u-i","u-z","u-y","g-r","g-i",
+                               "g-z","g-y","r-i","r-z","r-y","i-z","i-y",
+                               "z-y","z","u-g-(g-r)","g-r-(r-i)",
+                               "r-i-(i-z)","i-z-(z-y)","mcal_T")
+            else:
+                expressions = ("u-g","u-r","u-i","u-z","u-y","g-r","g-i",
+                               "g-z","g-y","r-i","r-z","r-y","i-z","i-y",
+                               "z-y","z","u-g-(g-r)","g-r-(r-i)",
+                               "r-i-(i-z)","i-z-(z-y)")
+
+        #Define the output SOM name
+        if self.opt['sizes'] == True:
+            som_outname = f"SOM_{som_dim}_{self.bands}_withsizelin.pkl"
+        else:
+            som_outname = f"SOM_{som_dim}_{self.bands}.pkl"
+
+        #If Testing, load the pretrained SOM and matching training catalogue
+        if self.opt['testing'] == True:
+            #Use the default training set
+            with open(f"test_{som_outname}", 'rb') as f:
+                som = pickle.load(f)
+            print(base.length(som.rx2['unit.classif']))
+            with open("training_testcat.pkl", 'rb') as f:
+                train_df = pickle.load(f)
+            print(utils.str(train_df))
+        else:
+            #Use the training set provided
+            print("Preparing the data")
+            training_data = pd.DataFrame.from_dict(training_data)
+            #Add the redshift variable to the train data
+            print("Adding redshift info to training data")
+            training_data['redshift_true'] = training_z
+
+            if sparse_frac < 1:
+                print("Sparse Sampling the training data")
+                cut = np.random.uniform(0, 1, training_z.size) < sparse_frac
+                training_data = training_data[cut]
+                training_z = training_z[cut]
+
+            #Construct the training data frame (just a python-to-R data conversion)
+            print("Converting the data to R format")
+            with localconverter(ro.default_converter + pandas2ri.converter):
+                train_df = ro.conversion.py2rpy(training_data)
+
+            #with open("training_testcat.pkl", 'wb') as f:
+            #    pickle.dump(train_df, f)
+
+            #Construct or Load the SOM
+            if not os.path.exists(som_outname):
+                print("Training the SOM using R kohtrain")
+                #Train the SOM using R kohtrain
+                som=kohonen.kohtrain(data=train_df,som_dim=IntVector(som_dim),max_na_frac=0,data_threshold=FloatVector(data_threshold),
+                                     n_cores=num_threads,train_expr=StrVector(expressions),train_sparse=False,sparse_frac=sparse_frac)
+                print(base.length(som.rx2['unit.classif']))
+                #Output the SOM
+                with open(som_outname, 'wb') as f:
+                    pickle.dump(som, f)
+            else:
+                print("Loading the pretrained SOM")
+                with open(som_outname, 'rb') as f:
+                    som = pickle.load(f)
+                som.rx2['unit.classif']=FloatVector([])
+
+        #If grouping by redshift, construct the cell redshift statistics
+        if group_type == 'redshift' or plots == True:
+            print("Constructing cell-based redshift properties")
+            #Construct the Nz properties per SOM cell
+            cell_prop=kohonen.generate_kohgroup_property(som=som,data=train_df,
+                          expression=property_expressions,expr_label=property_labels,returnMatrix=True)
+            som=cell_prop.rx2('som')
+            som.rx2['clust.classif']=FloatVector([])
+            print("Constructing redshift-based hierarchical cluster tree")
+            #Cluster the SOM cells into num_groups groups
+            if whiten==True:
+                props = kohonen.kohwhiten(cell_prop.rx2['property'],train_expr=base.colnames(cell_prop.rx2['property']),
+                                          data_missing='NA',data_threshold=FloatVector([0,12]))
+                props = props.rx2("data.white")
+                props.rx[base.which(base.is_na(props))] = -10
+            else:
+                props = cell_prop.rx2['property']
+                props.rx[base.which(base.is_na(props))] = -1
+
+            print(base.summary(props))
+
+            if group_type=='redshift':
+                print("Groups will be constructed by redshift property")
+                hclust=stats.hclust(stats.dist(props))
+                cell_group=stats.cutree(hclust,k=num_groups)
+                #Assign the cell groups to the SOM structure
+                som.rx2['hclust']=hclust
+                som.rx2['cell_clust']=cell_group
+            else:
+                print("Groups will be constructed by colour property")
+
+        #Construct the Nz properties per SOM group
+        print("Constructing group-based redshift properties")
+        group_prop=kohonen.generate_kohgroup_property(som=som,data=train_df,
+                       expression=property_expressions,expr_label=property_labels,
+                       n_cluster_bins=num_groups,returnMatrix=True)
+
+        #extract the training som (just for convenience)
+        train_som = group_prop.rx2('som')
+
+        if plots == True and redshift_propset==1:
+            #Make the diagnostic plots
+            props = group_prop.rx2('property')
+            cprops = cell_prop.rx2('property')
+            print(base.colnames(props))
+            print("Constructing SOM diagnostic figures")
+            #Initialise the plot device
+            dev.png(file=f'{plot_dir}/SOMfig_%02d.png',height=5,width=5,res=220,unit='in')
+            #Paint the SOM by the training properties
+            for i in range(len(expressions)):
+                gr.plot(train_som,property=i+1,shape='straight',heatkeywidth=som_dim[0]/20,ncolors=1e3,zlim=FloatVector([-3,3]))
+            #Plot the standard diagnostics:
+            #Count
+            gr.plot(train_som,type='count',shape='straight',heatkeywidth=som_dim[0]/20,zlog=True,ncolors=1e3)
+            #Quality
+            gr.plot(train_som,type='quality',shape='straight',heatkeywidth=som_dim[0]/20,zlim=FloatVector([0,0.5]),ncolors=1e3)
+            #UMatrix
+            gr.plot(train_som,type='dist',shape='straight',heatkeywidth=som_dim[0]/20,zlog=True,zlim=FloatVector([0,0.5]),ncolors=1e3)
+            #Sometimes these figures need customised limits
+
+            #Paint by the redshift diagnostics:
+            #mean redshift
+            gr.plot(train_som,property=cprops.rx(True,base.which(cprops.colnames.ro=='mean_z_true')),ncolors=1e3,zlog=False,
+                    type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Cell mean redshift',zlim=FloatVector([0,1.8]))
+            #redshift std
+            gr.plot(train_som,property=cprops.rx(True,base.which(cprops.colnames.ro=='sd_z_true')),ncolors=1e3,zlog=False,
+                    type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Cell redshift stdev',zlim=FloatVector([0,0.2]))
+            #2sigma redshift IQR
+            gr.plot(train_som,property=cprops.rx(True,base.which(cprops.colnames.ro=='iqr_z_true')),ncolors=1e3,zlog=False,
+                    type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Cell 2sigma IQR',zlim=FloatVector([0,0.4]))
+            #mean redshift
+            gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='mean_z_true')),ncolors=1e3,zlog=False,
+                    type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Group mean redshift',zlim=FloatVector([0,1.8]))
+            #redshift std
+            gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='sd_z_true')),ncolors=1e3,zlog=False,
+                    type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Group redshift stdev',zlim=FloatVector([0,0.2]))
+            #2sigma redshift IQR
+            gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='iqr_z_true')),ncolors=1e3,zlog=False,
+                    type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Group 2sigma IQR',zlim=FloatVector([0,0.4]))
+            ##N group
+            #gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='N')),ncolors=1e3,zlog=False,
+            #        type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='N group')
+
+            #Close the plot device
+            dev.dev_off()
+
+        print("Constructing the per-group Nz")
+        #Get the group assignments for the training data
+        train_group = np.array(train_som.rx2['clust.classif'])
+        tab=base.table(FactorVector(train_som.rx2['clust.classif'],levels=base.seq(num_groups)))
+        print(tab)
+        print(np.arange(num_groups)[np.array(tab)==0])
+        #Construct the Nz z-axis
+        z_arr = np.linspace(0, 2, 124)
+        z_cen = z_arr[0:-1]+(z_arr[1]-z_arr[0])/2.
+        #Construct the per-group Nz
+        nzs = [(np.histogram(training_z[train_group == group+1],z_arr)[0])
+               for group in np.arange(num_groups)[np.array(tab)!=0]]
+        #np.save('plots/nzs.npy', nzs)
+
+        #Update the fsky
+        # Renormalize to use the same sample definitions as the
+        # official metric calculators.
+        fsky = 0.25
+        ndens_arcmin = 20.
+        area_arcmin = (180*60/np.pi)**2*4*np.pi*fsky
+        ng_tot_goal = ndens_arcmin * area_arcmin
+        ng_tot_curr = np.sum(np.array(tab))
+        nzs = np.array([nz * ng_tot_goal / ng_tot_curr for nz in nzs])
+        #np.save('plots/nzs_norm.npy', nzs)
+
+        # Now initialize a S/N calculator for these initial groups.
+        os.system('rm wl_nb%d.npz' % n_bin)
+        if metric == 'SNR_ww':
+            c_wl = SnCalc(z_cen, nzs, use_clustering=False, fsky=fsky)
+        else:
+            c_wl = SnCalc(z_cen, nzs, use_clustering=True,fsky=fsky)
+        print("Initializing Cls")
+        c_wl.get_cl_matrix(fname_save='wl_nb%d.npz' % n_bin)
+
+        # Initial edge guess
+        nz_tot = np.sum(nzs, axis=0)
+        cumulative_fraction = np.cumsum(nz_tot) / np.sum(nz_tot)
+        cumul_f = interp1d(z_cen, cumulative_fraction, bounds_error=False,
+                           fill_value=(0, 1))
+        edges_0 = np.array([brentq(lambda z : cumul_f(z) - q, 0, 2)
+                            for q in (np.arange(n_bin-1)+1.)/n_bin])
+
+        # Finally, define the function to minimize and optimize the bin edges.
+        def minus_sn(edges, calc):
+            return -calc.get_sn_from_edges(edges,assign_params=assign_params)
+
+        print("Optimizing WL")
+        res = minimize(minus_sn, edges_0, method='Powell', args=(c_wl,))
+        print("Final edges: ", res.x)
+        print("Maximum S/N: ", c_wl.get_sn_from_edges(res.x,
+                                                      assign_params=assign_params))
+        print(" ")
+
+        #Assign the SOM groups to tomographic bins
+        print("Outputting trained SOM and redshift ordering of SOM groups")
+        groups_in_tomo = c_wl.assign_from_edges(res.x,
+                                                assign_params=assign_params,
+                                                get_ids=True)
+        group_bins = np.zeros(num_groups)
+        for bin_no, groups in groups_in_tomo:
+            print(bin_no, groups)
+            group_bins[groups] = bin_no
+
+        print("Finished")
+        self.train_som = train_som
+        self.group_bins = group_bins
+
+    def apply (self, data):
+        """Applies training to the data.
+
+        Parameters:
+        -----------
+        Data: numpy array, size Ngalaxies x Nbands
+          testing data, each row is a galaxy, each column is a band as per
+          band defined above
+
+        Returns:
+          tomographic_selections: numpy array, int, size Ngalaxies
+            tomographic selection for galaxies returned as bin number for
+            each galaxy.
+        """
+
+        #Number of tomographic bins
+        n_bin = self.opt['bins']
+        #Number of discrete SOM groups
+        num_groups = self.opt['num_groups']
+
+        print("Preparing the data")
+        data = pd.DataFrame.from_dict(data)
+
+        #Construct the validation data frame (just a python-to-R data conversion)
+        print("Converting the data to R format")
+        with localconverter(ro.default_converter + pandas2ri.converter):
+            data_df = ro.conversion.py2rpy(data)
+
+        print("Parsing the validation data into the SOM groupings")
+        #Generate the validation associations/groups
+        group_prop=kohonen.generate_kohgroup_property(som=self.train_som,data=data_df,
+                       expression="nrow(data)",expr_label="N",
+                       n_cluster_bins=num_groups,n_cores=self.opt['num_threads'],returnMatrix=True)
+
+        #extract the validation som (just for convenience)
+        valid_som = group_prop.rx2('som')
+
+        #Assign the sources, by group, to tomographic bins
+        print("Output source tomographic bin assignments")
+        valid_bin = base.unsplit(IntVector(self.group_bins),
+                                 FactorVector(valid_som.rx2['clust.classif'],
+                                              levels=base.seq(num_groups)),
+                                 drop=False)
+        valid_bin = np.array(valid_bin)
+
+        return valid_bin
+
diff --git a/tomo_challenge/classifiers/SimpleSOM.py b/tomo_challenge/classifiers/SimpleSOM.py
new file mode 100644
index 00000000..43d7ed98
--- /dev/null
+++ b/tomo_challenge/classifiers/SimpleSOM.py
@@ -0,0 +1,323 @@
+"""
+This is an example tomographic bin generator using a Self-Organising Map (SOM).
+
+Every classifier module needs to:
+ - have construction of the type
+       __init__ (self, bands, options) (see examples below)
+ - implement two functions:
+       train (self, training_data, training_z)
+       apply (self, data).
+ - define valid_options class variable.
+
+See Classifier Documentation below.
+"""
+
+from .base import Tomographer
+import numpy as np
+import rpy2
+import rpy2.robjects as ro
+import rpy2.robjects.packages as rpack
+from rpy2.robjects.vectors import StrVector, IntVector, DataFrame, FloatVector, FactorVector
+import pandas as pd
+from rpy2.robjects import pandas2ri
+from rpy2.robjects.conversion import localconverter
+
+#Check that all the needed packages are installed
+# R package names
+packnames = ('data.table','itertools','foreach','doParallel','RColorBrewer','devtools','matrixStats')
+base=ro.packages.importr("base")
+utils=ro.packages.importr("utils")
+stats=ro.packages.importr("stats")
+gr=ro.packages.importr("graphics")
+dev=ro.packages.importr("grDevices")
+utils.chooseCRANmirror(ind=1)
+# Selectively install what needs to be installed.
+names_to_install = [x for x in packnames if not rpack.isinstalled(x)]
+if len(names_to_install) > 0:
+    utils.install_packages(StrVector(names_to_install))
+base.Sys_setenv(TAR=base.system("which tar",intern=True))
+devtools=ro.packages.importr("devtools")
+#devtools.install_github("AngusWright/helpRfuncs")
+#devtools.install_github("AngusWright/kohonen/kohonen")
+kohonen=ro.packages.importr("kohonen")
+
+class SimpleSOM(Tomographer):
+    """ Simplistic SOM Classifier """
+
+    # valid parameter -- see below
+    valid_options = ['bins','som_dim','num_groups','num_threads',
+                     'group_type','data_threshold','sparse_frac','plots','plot_dir']
+    # this setting means dictionaries (not arrays) will be sent to
+    # train and apply
+    wants_arrays = False
+
+    def __init__ (self, bands, options):
+        """Constructor
+
+        Parameters:
+        -----------
+        bands: str
+          string containing valid bands, like 'riz' or 'griz'
+        options: dict
+          options come through here. Valid keys are listed as valid_options
+          class variable.
+ + Note: + ----- + Valiad options are: + 'bins' - number of tomographic bins + 'som_dim' - dimensions of the SOM + 'num_groups' - number of SOM groups/associations + 'num_threads' - number of threads to use + 'sparse_frac' - the sparse-sampling fraction used for training + 'group_type' - do we want grouping by colour or redshift + 'data_threshold' - number of threads to use + + """ + self.bands = bands + self.opt = options + + def train (self, training_data, training_z): + """Trains the SOM and outputs the resulting bins + + Parameters: + ----------- + training_data: numpy array, size Ngalaxes x Nbands + training data, each row is a galaxy, each column is a band as per + band defined above + training_z: numpy array, size Ngalaxies + true redshift for the training sample + + """ + + print("Initialising") + #Number of tomographic bins + n_bin = self.opt['bins'] + #Dimensions of the SOM + som_dim = self.opt['som_dim'] + #Number of discrete SOM groups + num_groups = self.opt['num_groups'] + #Number of threads + num_threads = self.opt['num_threads'] + #Sparse Frac + sparse_frac = self.opt['sparse_frac'] + #Data Threshold + data_threshold = self.opt['data_threshold'] + #Plots + plots = self.opt['plots'] + #Plot Output Directory + plot_dir = self.opt['plot_dir'] + #Group Type + group_type = self.opt['group_type'] + #Check that the group type is a valid choice + if group_type != 'colour' and group_type != 'redshift': + group_type = 'redshift' + + #Define the redshift summary statistics (used for making groups in the 'redshift' case + property_labels = ("mean_z_true","med_z_true","sd_z_true","mad_z_true","N","iqr_z_true") + property_expressions = ("mean(data$redshift_true)","median(data$redshift_true)","sd(data$redshift_true)", + "mad(data$redshift_true)","nrow(data)", + "diff(quantile(data$redshift_true,probs=pnorm(c(-2,2))))") + #Define the SOM variables + if self.bands == 'riz': + #riz bands + #expressions = ("r_mag-i_mag","r_mag-z_mag","i_mag-z_mag", + # "z_mag","r_mag-i_mag-(i_mag-z_mag)") + expressions = ("r-i","r-z","i-z", + "z","r-i-(i-z)") + elif self.bands == 'griz': + #griz bands + #expressions = ("g_mag-r_mag","g_mag-i_mag", + # "g_mag-z_mag","r_mag-i_mag","r_mag-z_mag","i_mag-z_mag", + # "z_mag","g_mag-r_mag-(r_mag-i_mag)", + # "r_mag-i_mag-(i_mag-z_mag)") + expressions = ("g-r","g-i", + "g-z","r-i","r-z","i-z", + "z","g-r-(r-i)", + "r-i-(i-z)") + elif self.bands == 'grizy': + #grizy bands + #expressions = ("g_mag-r_mag","g_mag-i_mag", + # "g_mag-z_mag","g_mag-y_mag","r_mag-i_mag","r_mag-z_mag","r_mag-y_mag","i_mag-z_mag","i_mag-y_mag", + # "z_mag-y_mag","z_mag","g_mag-r_mag-(r_mag-i_mag)", + # "r_mag-i_mag-(i_mag-z_mag)","i_mag-z_mag-(z_mag-y_mag)") + expressions = ("g-r","g-i", + "g-z","g-y","r-i","r-z","r-y","i-z","i-y", + "z-y","z","g-r-(r-i)", + "r-i-(i-z)","i-z-(z-y)") + elif self.bands == 'ugriz': + #ugrizy bands + #expressions = ("u_mag-g_mag","u_mag-r_mag","u_mag-i_mag","u_mag-z_mag","g_mag-r_mag","g_mag-i_mag", + # "g_mag-z_mag","r_mag-i_mag","r_mag-z_mag","i_mag-z_mag", + # "z_mag","u_mag-g_mag-(g_mag-r_mag)","g_mag-r_mag-(r_mag-i_mag)", + # "r_mag-i_mag-(i_mag-z_mag)") + expressions = ("u-g","u-r","u-i","u-z","g-r","g-i", + "g-z","r-i","r-z","i-z", + "z","u-g-(g-r)","g-r-(r-i)", + "r-i-(i-z)") + elif self.bands == 'ugrizy': + #ugrizy bands + #expressions = ("u_mag-g_mag","u_mag-r_mag","u_mag-i_mag","u_mag-z_mag","u_mag-y_mag","g_mag-r_mag","g_mag-i_mag", + # "g_mag-z_mag","g_mag-y_mag","r_mag-i_mag","r_mag-z_mag","r_mag-y_mag","i_mag-z_mag","i_mag-y_mag", + # 
"z_mag-y_mag","z_mag","u_mag-g_mag-(g_mag-r_mag)","g_mag-r_mag-(r_mag-i_mag)", + # "r_mag-i_mag-(i_mag-z_mag)","i_mag-z_mag-(z_mag-y_mag)") + expressions = ("u-g","u-r","u-i","u-z","u-y","g-r","g-i", + "g-z","g-y","r-i","r-z","r-y","i-z","i-y", + "z-y","z","u-g-(g-r)","g-r-(r-i)", + "r-i-(i-z)","i-z-(z-y)") + + print("Preparing the data") + training_data = pd.DataFrame.from_dict(training_data) + #Add the redshift variable to the train data + print("Adding redshift info to training data") + training_data['redshift_true'] = training_z + + if sparse_frac < 1: + print("Sparse Sampling the training data") + cut = np.random.uniform(0, 1, training_z.size) < sparse_frac + training_data = training_data[cut] + + #Construct the training data frame (just a python-to-R data conversion) + print("Converting the data to R format") + with localconverter(ro.default_converter + pandas2ri.converter): + #train_df = ro.conversion.py2rpy(train[['u_mag','g_mag','r_mag','i_mag','z_mag','y_mag']]) + train_df = ro.conversion.py2rpy(training_data) + + print("Training the SOM using R kohtrain") + #Train the SOM using R kohtrain + som=kohonen.kohtrain(data=train_df,som_dim=IntVector(som_dim),max_na_frac=0,data_threshold=FloatVector(data_threshold), + n_cores=num_threads,train_expr=StrVector(expressions),train_sparse=False,sparse_frac=sparse_frac) + + #If grouping by redshift, construct the cell redshift statistics + if group_type == 'redshift' or plots == True: + print("Constructing cell-based redshift properties") + #Construct the Nz properties per SOM cell + cell_prop=kohonen.generate_kohgroup_property(som=som,data=train_df, + expression=StrVector(property_expressions),expr_label=StrVector(property_labels)) + print("Constructing redshift-based hierarchical cluster tree") + #Cluster the SOM cells into num_groups groups + hclust=stats.hclust(stats.dist(cell_prop.rx2['property'])) + cell_group=stats.cutree(hclust,k=num_groups) + + #Assign the cell groups to the SOM structure + som.rx2['hclust']=hclust + som.rx2['cell_clust']=cell_group + + #Construct the Nz properties per SOM group + print("Constructing group-based redshift properties") + group_prop=kohonen.generate_kohgroup_property(som=som,data=train_df, + expression=StrVector(property_expressions),expr_label=StrVector(property_labels), + n_cluster_bins=num_groups) + + #extract the training som (just for convenience) + train_som = group_prop.rx2('som') + + if plots == True: + #Make the diagnostic plots + props = group_prop.rx2('property') + print(base.colnames(props)) + print("Constructing SOM diagnostic figures") + #Initialise the plot device + dev.png(file=f'{plot_dir}/SOMfig_%02d.png',height=5,width=5,res=220,unit='in') + #Paint the SOM by the training properties + for i in range(len(expressions)): + gr.plot(train_som,property=i+1,shape='straight',heatkeywidth=som_dim[0]/20,ncolors=1e3,zlim=FloatVector([-3,3])) + #Plot the standard diagnostics: + #Count + gr.plot(train_som,type='count',shape='straight',heatkeywidth=som_dim[0]/20,zlog=True,ncolors=1e3) + #Quality + gr.plot(train_som,type='quality',shape='straight',heatkeywidth=som_dim[0]/20,zlim=FloatVector([0,0.5]),ncolors=1e3) + #UMatrix + gr.plot(train_som,type='dist',shape='straight',heatkeywidth=som_dim[0]/20,zlog=True,zlim=FloatVector([0,0.5]),ncolors=1e3) + #Sometimes these figures need customised limits; the "try" stops bad plots from stopping the code + + #Paint by the redshift diagnostics: + #mean redshift + 
gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='mean_z_true')),ncolors=1e3,zlog=False, + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Group mean redshift',zlim=FloatVector([0,1.8])) + #redshift std + gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='sd_z_true')),ncolors=1e3,zlog=False, + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='Group redshift stdev',zlim=FloatVector([0,0.2])) + #2sigma redshift IQR + gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='iqr_z_true')),ncolors=1e3,zlog=False, + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='group 2sigma IQR',zlim=FloatVector([0,0.4])) + #N group + gr.plot(train_som,property=props.rx(True,base.which(props.colnames.ro=='N')),ncolors=1e3,zlog=False, + type='property',shape='straight',heatkeywidth=som_dim[0]/20,main='N group') + + #Close the plot device + dev.dev_off() + + + + print("Outputting trained SOM and redshift ordering of SOM groups") + #Extract the mean-z per group + group_z = group_prop.rx2('property').rx(True, + base.which(group_prop.rx2('property').colnames.ro=='mean_z_true')) + #Order the groups by mean z + z_order = base.order(group_z) + + print("Finished") + self.train_som = train_som + self.group_z = group_z + self.z_order = z_order + + def apply (self, data): + """Applies training to the data. + + Parameters: + ----------- + Data: numpy array, size Ngalaxes x Nbands + testing data, each row is a galaxy, each column is a band as per + band defined above + + Returns: + tomographic_selections: numpy array, int, size Ngalaxies + tomographic selection for galaxies return as bin number for + each galaxy. + """ + + #Number of tomographic bins + n_bin = self.opt['bins'] + + print("Preparing the data") + data = pd.DataFrame.from_dict(data) + + #Construct the validation data frame (just a python-to-R data conversion) + print("Converting the data to R format") + with localconverter(ro.default_converter + pandas2ri.converter): + #train_df = ro.conversion.py2rpy(train[['u_mag','g_mag','r_mag','i_mag','z_mag','y_mag']]) + data_df = ro.conversion.py2rpy(data) + + print("Parsing the validation data into the SOM groupings") + #Generate the validation associations/groups + group_prop=kohonen.generate_kohgroup_property(som=self.train_som,data=data_df, + expression="nrow(data)",expr_label="N", + n_cluster_bins=self.opt['num_groups'],n_cores=self.opt['num_threads']) + + #Calculate the cumulative count + print("Generate cumulative source counts a.f.o. group mean z") + zcumsum=base.cumsum(group_prop.rx2('property').rx(self.z_order)) + + # Find the edges that split the redshifts into n_z bins of + # equal number counts in each + print("Assign the groups to tomographic bins") + p = np.linspace(0, 100, n_bin + 1) + n_edges = np.percentile(zcumsum, p) + + # Now put the groups into redshift bins. 
+        group_bins = FloatVector(base.cut(FloatVector(zcumsum), FloatVector(n_edges),
+                                 include_lowest=True)).rx(base.order(self.z_order))
+
+        #Extract the validation som (just for convenience)
+        valid_som = group_prop.rx2('som')
+
+        #Assign the sources, by group, to tomographic bins
+        print("Output source tomographic bin assignments")
+        valid_bin = base.unsplit(group_bins, FactorVector(valid_som.rx2['clust.classif'],
+                                 levels=base.seq(self.opt['num_groups'])), drop=False)
+        #Shift from R's 1-based bin labels to 0-based bin indices
+        valid_bin = np.array(valid_bin) - 1
+
+        return valid_bin
+
diff --git a/tomo_challenge/data.py b/tomo_challenge/data.py
index 03eadc5c..5b8a7af4 100644
--- a/tomo_challenge/data.py
+++ b/tomo_challenge/data.py
@@ -124,6 +124,12 @@ def add_colors(data, bands, errors=False):
         if errors:
             data[f'{b}{c}_err'] = np.sqrt(data[f'{b}_err']**2 + data[f'{c}_err']**2)
 
+def add_size(data, filename):
+
+    with h5py.File(filename, 'r') as f:
+        # load the mcal_T (metacal size) column
+        data['mcal_T'] = f['mcal_T'][:]
+
 def dict_to_array(data, bands, errors=False, colors=False):
 
     nobj = data[bands[0]].size
@@ -165,12 +171,15 @@
 
-def load_data(filename, bands, colors=False, errors=False, array=False):
+def load_data(filename, bands, colors=False, errors=False, size=False, array=False):
     data = load_mags(filename, bands, errors=errors)
 
     if colors:
         add_colors(data, bands, errors=errors)
 
+    if size:
+        add_size(data, filename)
+
     if array:
         data = dict_to_array(data, bands, errors=errors, colors=colors)
 
diff --git a/tomo_challenge/snc.py b/tomo_challenge/snc.py
new file mode 100644
index 00000000..4c1d537e
--- /dev/null
+++ b/tomo_challenge/snc.py
@@ -0,0 +1,370 @@
+import numpy as np
+import pyccl as ccl
+from scipy.integrate import simps
+import os
+
+
+assign_params_default = {'p_inbin_thr': 0.5,
+                         'p_outbin_thr': 0.2,
+                         'use_p_inbin': False,
+                         'use_p_outbin': False}
+
+
+class SnCalc(object):
+    edges_large = 3.
+
+    def __init__(self, z_arr, nz_list, fsky=0.4, lmax=2000, n_ell=100,
+                 s_gamma=0.26, use_clustering=False, use_3x2=False,
+                 integrator='qag_quad'):
+        """ S/N calculator
+        Args:
+            z_arr (array_like): array of redshifts at which all the N(z)s are
+                sampled. They should be linearly spaced.
+            nz_list (list): list of arrays containing the redshift
+                distributions of all the initial groups (each array in the
+                list corresponds to one group). The arrays should contain the
+                redshift distribution sampled at `z_arr`, with its integral
+                over `z_arr` equal to the number of galaxies in that group.
+            fsky (float): sky fraction (for SNR calculation and for
+                transforming number of galaxies into number densities).
+            lmax (float): maximum ell.
+            n_ell (int): number of ell bandpowers.
+            s_gamma (float): rms ellipticity scatter (relevant if
+                `use_clustering=False`).
+            use_clustering (bool): if `True`, SNR will be computed for
+                clustering instead of lensing.
+            use_3x2 (bool): if `True`, SNR will be computed for 3x2pt.
+            integrator (string): CCL integration method. Either 'qag_quad'
+                or 'spline'.
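+        Example (illustrative sketch only; the Gaussian N(z)s below are
+        placeholders, not real data):
+            >>> z = np.linspace(0, 2, 512)
+            >>> nzs = [1e4*np.exp(-0.5*((z-zc)/0.1)**2) for zc in (0.5, 0.9)]
+            >>> calc = SnCalc(z, nzs, fsky=0.4)
+            >>> snr = calc.get_sn([np.array([0]), np.array([1])])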
+ """ + self.integrator = integrator + self.s_gamma = s_gamma + self.fsky = fsky + self.lmax = lmax + self.n_ell = n_ell + ell_edges = np.logspace(2, np.log10(lmax), n_ell+1) + self.larr = 0.5*(ell_edges[1:]+ell_edges[:-1]) + self.d_ell = (ell_edges[1:]-ell_edges[:-1]) + self.n_samples = len(nz_list) + self.z_arr = z_arr + self.nz_list = nz_list + self.n_gals = np.array([simps(nz, x=self.z_arr) + for nz in self.nz_list]) + self.pz_list = self.nz_list / self.n_gals[:, None] + self.z_means = np.array([simps(self.z_arr*nz, x=self.z_arr) + for nz in self.nz_list]) / self.n_gals + self.n_dens = self.n_gals / (4*np.pi*self.fsky) + self.cls = None + self.use_clustering = use_clustering + self.use_3x2 = use_3x2 + if use_3x2: + self.n_tracers = 2*self.n_samples + else: + self.n_tracers = self.n_samples + + def _bz_model(self, cosmo, z): + return 1./ccl.growth_factor(cosmo, 1./(1+z)) + + def get_cl_matrix(self, fname_save=None, recompute=False): + """ Computes matrix of power spectra between all the initial groups. + Args: + fname_save (string): if not None, the result will be saved to a + file by this name, and if present and `recompute=False`, the + matrix will be read from that file. + recompute (bool): if `True`, any previous calculations of the + C_ell matrix stored on file will be ignored, and the matrix + will be recomputed. + """ + if fname_save is not None: + if os.path.isfile(fname_save) and not recompute: + self.cls = np.load(fname_save)['cls'] + return + + # Cosmology + cosmo = ccl.Cosmology(Omega_c=0.25, + Omega_b=0.05, + h=0.67, n_s=0.96, + sigma8=0.81) + + # Tracers + bz = self._bz_model(cosmo, self.z_arr) + if self.use_3x2: + trs_gc = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz), + bias=(self.z_arr, bz)) + for nz in self.nz_list] + trs_wl = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz)) + for nz in self.nz_list] + trs = trs_gc + trs_wl + else: + if self.use_clustering: + trs = [ccl.NumberCountsTracer(cosmo, False, (self.z_arr, nz), + bias=(self.z_arr, bz)) + for nz in self.nz_list] + else: + trs = [ccl.WeakLensingTracer(cosmo, (self.z_arr, nz)) + for nz in self.nz_list] + + # Cls + self.cls = np.zeros([self.n_ell, self.n_tracers, self.n_tracers]) + for i in range(self.n_tracers): + for j in range(i, self.n_tracers): + try: + cl = ccl.angular_cl(cosmo, trs[i], trs[j], self.larr, + limber_integration_method=self.integrator) + except: + cl = np.zeros(len(self.larr)) + self.cls[:, i, j] = cl + if j != i: + self.cls[:, j, i] = cl + + if fname_save is not None: + np.savez(fname_save, cls=self.cls) + + def get_resample_metadata(self, assign): + """ Transform an assignment list into a weight matrix and the + corresponding number densities. + Args: + assign (list): list of arrays. Each element of the list should + be a numpy array of integers, corresponding to the indices + of the initial groups that are now resampled into a larger + group. + """ + n_resamples = len(assign) + weight_res = np.zeros([n_resamples, self.n_samples]) + n_dens_res = np.zeros(n_resamples) + for ia, a in enumerate(assign): + ndens = self.n_dens[a] + ndens_tot = np.sum(ndens) + n_dens_res[ia] = ndens_tot + weight_res[ia, a] = ndens / ndens_tot + return weight_res, n_dens_res + + def _get_cl_resamples(self, weights): + """ Gets C_ell matrix for resampled groups from weights matrix. 
+ """ + if self.cls is None: + self.get_cl_matrix() + if self.use_3x2: + w_use = np.tile(weights, (2, 2)) + else: + w_use = weights + return np.sum(np.sum(w_use[None, None, :, :] * + self.cls[:, :, None, :], + axis=-1)[:, None, :, :] * + w_use[None, :, :, None], + axis=-2) + + def _get_nl_resamples(self, n_dens): + """ Gets noise contribution to C_ell matrix for resampled groups + from list of number densities. + """ + if self.use_3x2: + n_diag = np.array(list(1./n_dens) + + list(self.s_gamma**2/n_dens)) + return np.diag(n_diag)[None, :, :] + else: + if self.use_clustering: + return np.diag(1./n_dens)[None, :, :] + else: + return np.diag(self.s_gamma**2/n_dens)[None, :, :] + + def get_sn_max(self, full_output=False): + """ Compute maximum signal-to-noise ratio given the input + groups + Args: + full_output (bool): if true, a dictionary with additional + information will be returned. Otherwise just total S/N. + Returns: + If `full_output=True`, dictionary containing S/N, power + spectra and noise spectra. Otherwise just S/N. + """ + weights = np.eye(self.n_samples) + n_dens = self.n_dens + return self.get_sn_wn(weights, n_dens, full_output=full_output) + + def get_kltrans(self): + """ Compute KL decomposition. + + Returns: + Fisher matrix element for each of the KL modes. + """ + nl = self._get_nl_resamples(self.n_dens) + nij = nl*np.ones(self.n_ell)[:, None, None] + cij = self.cls + nij + sij = cij - nij + inv_nij = np.linalg.inv(nij) + metric = inv_nij + + def change_basis(c, m, ev): + return np.array([np.diag(np.dot(ev[i].T, + np.dot(m[i], + np.dot(c[i], + np.dot(m[i], + ev[i]))))) + for i in range(self.n_ell)]) + + def diagonalize(c, m): + im = np.linalg.inv(m) + ll = np.linalg.cholesky(m) + ill = np.linalg.cholesky(im) + cl = np.array([np.dot(np.transpose(ll[i]), + np.dot(c[i], ll[i])) + for i in range(self.n_ell)]) + c_p, v = np.linalg.eigh(cl) + ev = np.array([np.dot(np.transpose(ill[i]), v[i]) + for i in range(self.n_ell)]) + # iden = change_basis(im, m, ev) + return ev, c_p + + e_v, c_p = diagonalize(cij, metric) + s_p = change_basis(sij, metric, e_v) + nmodes_l = (self.fsky * self.d_ell * (self.larr+0.5)) + fish_kl = nmodes_l[:, None]*(s_p/c_p)**2 + isort = np.argsort(-np.sum(fish_kl, axis=0)) + e_o = e_v[:, :, isort] + # f_o = np.array([np.dot(inv_nij[l], e_o[l, :, :]) + # for l in range(self.n_ell)]) + c_p = change_basis(cij, metric, e_o) + s_p = change_basis(sij, metric, e_o) + + fish_kl = nmodes_l[:, None]*(s_p/c_p)**2 + fish_permode = np.sum(fish_kl, axis=0) + fish_cumul = np.cumsum(fish_permode) + return fish_kl, fish_permode, fish_cumul + + def get_sn_wn(self, weights, n_dens, full_output=False): + """ Compute signal-to-noise ratio from weights matrix and number + densities of the resampled groups. + Args: + weights (array_like): weights matrix of shape + `(N_resample, N_initial)`, where `N_resample` is the + number of new groups, and `N_initial` is the original + number of groups. Each entry corresponds to the weight + which with a given initial group enters the new set of + groups. + n_dens (array_like): number density (in sterad^-1) of the + new groups. + full_output (bool): if true, a dictionary with additional + information will be returned. Otherwise just total S/N. + Returns: + If `full_output=True`, dictionary containing S/N, power + spectra and noise spectra. Otherwise just S/N. 
+ """ + sl = self._get_cl_resamples(weights) + nl = self._get_nl_resamples(n_dens) + cl = sl + nl + icl = np.linalg.inv(cl) + sn2_1pt = np.sum(sl[:, :, :, None] * icl[:, None, :, :], axis=2) + sn2_ell = np.sum(sn2_1pt[:, :, :, None] * sn2_1pt[:, None, :, :], + axis=2) + trsn2_ell = np.trace(sn2_ell, axis1=1, axis2=2) + snr = np.sqrt(np.sum(trsn2_ell * (self.larr + 0.5) * + self.d_ell * self.fsky)) + + if full_output: + dret = {'snr': snr, 'cl': sl, 'nl': nl, 'ls': self.larr} + return dret + else: + return snr + + def get_sn(self, assign, full_output=False): + """ Compute signal-to-noise ratio from a bin assignment list. + Args: + assign (list): list of arrays. Each element of the list should + be a numpy array of integers, corresponding to the indices + of the initial groups that are now resampled into a larger + group. + full_output (bool): if true, a dictionary with additional + information will be returned. Otherwise just total S/N. + Returns: + If `full_output=True`, dictionary containing S/N, power + spectra and noise spectra. Otherwise just S/N. + """ + w, ndens = self.get_resample_metadata(assign) + return self.get_sn_wn(w, ndens, + full_output=full_output) + + def check_edges(self, edges): + """ Returns `True` if there's something wrong with the edges. + """ + swapped = False + if np.ndim(edges) > 0: + swapped = np.any(np.diff(edges) < 0) + return np.any(edges < 0) or \ + np.any(edges > self.edges_large) or \ + swapped + + def get_nzs_from_edges(self, edges, assign_params=assign_params_default): + assign = self.assign_from_edges(edges, assign_params=assign_params) + nzs = np.array([np.sum(self.nz_list[a, :], axis=0) + for a in assign]) + return nzs + + def assign_from_edges(self, edges, assign_params=assign_params_default, + get_ids=False): + """ Get assignment list from edges and assign parameters. + Args: + edges (array_like): array of bin edges. + assign_params (dict): dictionary of assignment parameters + (see `assign_params_default`). + get_ids (bool): if `True`, output assignment arrays will + be accompanied by the associated bin id (with -1 for the + trash bin). + Returns: + List of assignment arrays ready to be used in e.g. `get_sn`. 
+ """ + edges = np.atleast_1d(edges) + nbins = len(edges) + 1 + # Bin IDs based on mean z + ids = np.digitize(self.z_means, bins=edges) + if assign_params['use_p_inbin'] or assign_params['use_p_outbin']: + def integrate_safe(p, z, m): + if np.sum(m) == 0: + return 0 + else: + return simps(p[m], x=z[m]) + # Calculate probabilities in each bin + zgroups = np.digitize(self.z_arr, bins=edges) + masks = [zgroups == i for i in range(nbins)] + if assign_params['use_p_outbin']: + # Matrix of probabilities in each bin + pzs = np.array([[integrate_safe(pz, self.z_arr, m) + for m in masks] + for pz in self.pz_list]) + pzd = np.array([pzs[j, ids[j]] + for j in range(self.n_samples)]) + else: + # Overlaps in own bin + pzd = np.array([integrate_safe(pz, self.z_arr, masks[i]) + for pz, i in zip(self.pz_list, ids)]) + # Throw away based on in-bin probability + if assign_params['use_p_inbin']: + ids[pzd < assign_params['p_inbin_thr']] = -1 + # Throw away based on off-bin probability + if assign_params['use_p_outbin']: + ids[np.array([np.sum(p > assign_params['p_outbin_thr']) > 1 + for p in pzs])] = -1 + # Assignment list + if get_ids: + return [(i, np.where(ids == i)[0]) for i in np.unique(ids)] + else: + return [np.where(ids == i)[0] for i in np.unique(ids)] + + def get_sn_from_edges(self, edges, full_output=False, + assign_params=assign_params_default): + """ Compute signal-to-noise ratio from a set of edges and assignment + parameters. + Args: + edges (array_like): array of bin edges. + assign_params (dict): dictionary of assignment parameters + (see `assign_params_default`). + full_output (bool): if true, a dictionary with additional + information will be returned. Otherwise just total S/N. + Returns: + If `full_output=True`, dictionary containing S/N, power + spectra and noise spectra. Otherwise just S/N. + """ + if self.check_edges(edges): + return 0 + assign = self.assign_from_edges(edges, assign_params=assign_params) + return self.get_sn(assign, full_output=full_output)