Source code for bfgn.data_management.training_data

import copy
import fiona
import gdal
import logging
import numpy as np
import ogr
import osr
import os
from tqdm import tqdm
from typing import List, Tuple

from bfgn.configuration import configs, sections
from bfgn.data_management import common_io, ooc_functions, data_core

_logger = logging.getLogger(__name__)


def build_training_data_ordered(
        config: configs.Config, feature_raw_band_types: List[str], response_raw_band_types: List[str]
) -> Tuple[List[np.array], List[np.array], List[np.array], List[str], List[str], List[np.array], List[np.array]]:

    # TODO: check default, and set it to a standard value
    if config.data_build.random_seed:
        _logger.debug('Setting random seed to {}'.format(config.data_build.random_seed))
        np.random.seed(config.data_build.random_seed)

    # TODO: move to config checks
    if (isinstance(config.data_build.max_samples, list)):
        if (len(config.data_build.max_samples) != len(config.raw_files.feature_files)):
            raise Exception('max_samples must equal feature_files length, or be an integer.')

    n_features = int(np.sum([len(feat_type) for feat_type in feature_raw_band_types]))
    n_responses = int(np.sum([len(resp_type) for resp_type in response_raw_band_types]))

    # TODO: move to checks
    feature_memmap_size_gb = n_features*4*config.data_build.max_samples * \
        (config.data_build.window_radius*2)**2 / 1024.**3
    assert feature_memmap_size_gb < config.data_build.max_built_data_gb, \
        'Expected feature memmap size: {} Gb, limit: {}'.format(
            feature_memmap_size_gb, config.data_build.max_built_data_gb)

    response_memmap_size_gb = n_responses*4*config.data_build.max_samples * \
        (config.data_build.window_radius*2)**2 / 1024.**3
    assert response_memmap_size_gb < config.data_build.max_built_data_gb, \
        'Expected response memmap size: {} Gb, limit: {}'.format(
            response_memmap_size_gb, config.data_build.max_built_data_gb)

    features, responses = _open_temporary_features_responses_data_files(config, n_features, n_responses)
    _log_munged_data_information(features, responses)

    _logger.debug('Pre-compute all subset locations')
    all_site_upper_lefts = []
    all_site_xy_locations = []
    all_site_xy_sizes = []
    reference_subset_geotransforms = []
    for _site in range(0, len(config.raw_files.feature_files)):

        feature_sets = [gdal.Open(loc_file, gdal.GA_ReadOnly) for loc_file in config.raw_files.feature_files[_site]]
        response_sets = [common_io.noerror_open(loc_file) for loc_file in config.raw_files.response_files[_site]]
        boundary_set = common_io.get_site_boundary_set(config, _site)

        all_set_upper_lefts, xy_sample_locations, xy_max_size = \
            common_io.get_all_interior_extent_subset_pixel_locations(
                gdal_datasets=[feature_sets, [rs for rs in response_sets if rs is not None],
                               [bs for bs in [boundary_set] if bs is not None]],
                window_radius=config.data_build.window_radius,
                inner_window_radius=config.data_build.loss_window_radius,
                shuffle=True,
                return_xy_size=True)

        all_site_upper_lefts.append(all_set_upper_lefts)
        all_site_xy_locations.append(xy_sample_locations)
        all_site_xy_sizes.append(xy_max_size)

        ref_trans = feature_sets[0].GetGeoTransform()
        ref_subset_geotransform = [ref_trans[0]+all_set_upper_lefts[0][0][0]*ref_trans[1], ref_trans[1], 0,
                                   ref_trans[3]+all_set_upper_lefts[0][0][1]*ref_trans[5], 0, ref_trans[5]]
        reference_subset_geotransforms.append(ref_subset_geotransform)

    if (config.data_build.sparse_read is True):
        _logger.info('Sparse read is on, so use a pass over the response files to thin down the training points to read')
        for _site in range(0, len(config.raw_files.feature_files)):
            response_sets = [common_io.noerror_open(loc_file) for loc_file in config.raw_files.response_files[_site]]
            boundary_set = common_io.get_site_boundary_set(config, _site)

            valid_locations = np.ones(all_site_xy_locations[_site].shape[0]).astype(bool)
            for _row in tqdm(range(0, all_site_xy_sizes[_site][1]), ncols=80, desc='Sparse-data filter'):
                subset = np.squeeze(np.where(all_site_xy_locations[_site][:, 1] == _row))
                if (len(subset) > 0):
                    if (boundary_set is not None):
                        bound_dat = common_io.read_chunk_by_row(boundary_set, all_site_upper_lefts[_site][-1],
                                                                all_site_xy_sizes[_site][0],
                                                                config.data_build.loss_window_radius*2, _row,
                                                                nodata_value=config.raw_files.boundary_bad_value)
                        for _x_loc in range(len(subset)):
                            x_loc = all_site_xy_locations[_site][subset[_x_loc], 0]
                            boundary_fraction = np.sum(np.isnan(
                                bound_dat[:, x_loc:x_loc+config.data_build.loss_window_radius*2])) / \
                                float(config.data_build.loss_window_radius**2)
                            _logger.debug('Sparsity check boundary fraction: {}'.format(boundary_fraction))
                            if (boundary_fraction > config.data_build.response_nodata_maximum_fraction):
                                valid_locations[subset[_x_loc]] = False

                    if (np.any(valid_locations[subset])):
                        col_dat = common_io.read_chunk_by_row(response_sets,
                                                              all_site_upper_lefts[_site][len(
                                                                  config.raw_files.feature_files[_site])],
                                                              all_site_xy_sizes[_site][0],
                                                              config.data_build.loss_window_radius*2, _row,
                                                              nodata_value=config.raw_files.response_nodata_value)
                        for _x_loc in range(len(subset)):
                            x_loc = all_site_xy_locations[_site][subset[_x_loc], 0]
                            response_fractions = np.sum(np.isnan(
                                col_dat[:, x_loc:x_loc+config.data_build.loss_window_radius*2]), axis=(0, 1)) / \
                                float(config.data_build.loss_window_radius**2)
                            _logger.debug('Sparsity check response fraction: {}'.format(response_fractions))
                            if (np.any(response_fractions > config.data_build.response_nodata_maximum_fraction)):
                                valid_locations[subset[_x_loc]] = False

            _logger.info('Filtering {} out of {} points from sparsity check.'.format(
                len(valid_locations) - np.sum(valid_locations), len(valid_locations)))
            all_site_xy_locations[_site] = all_site_xy_locations[_site][valid_locations, :]

    num_reads_per_site = int(np.floor(config.data_build.max_samples / len(config.raw_files.feature_files)))
    _logger.debug('Step through sites in order and grab {} samples from each until max samples'.format(
        num_reads_per_site))
    progress_bar = tqdm(total=config.data_build.max_samples, ncols=80, desc='Reading data chunks')

    _sample_index = 0
    _rs_index = 0
    remaining_sites = list(range(len(config.raw_files.feature_files)))
    _site_xy_index = np.zeros(len(remaining_sites)).astype(int).tolist()
    while (_sample_index < config.data_build.max_samples and len(remaining_sites) > 0):
        _site = remaining_sites[_rs_index]
        _logger.debug('Reading loop: Site {}'.format(_site))

        # Reset the per-site read counter before sampling this site
        site_read_count = 0
        while _site_xy_index[_site] < len(all_site_xy_locations[_site]):
            _logger.debug('Site index: {}'.format(_site_xy_index[_site]))
            success = read_segmentation_chunk(_site,
                                              all_site_upper_lefts[_site],
                                              all_site_xy_locations[_site][_site_xy_index[_site], :].copy(),
                                              config,
                                              reference_subset_geotransforms[_site],
                                              _sample_index)
            _site_xy_index[_site] += 1

            # If every location in this site has been checked, drop it from the rotation
            if (_site_xy_index[_site] >= len(all_site_xy_locations[_site])):
                _logger.debug('All locations in site {} have been checked.'.format(
                    config.raw_files.feature_files[_site][0]))
                remaining_sites.pop(_rs_index)

            _rs_index += 1
            if (_rs_index >= len(remaining_sites)):
                _rs_index = 0

            if success is True:
                _sample_index += 1
                progress_bar.update(1)
                site_read_count += 1

            if (site_read_count > num_reads_per_site or _sample_index >= config.data_build.max_samples):
                break

    progress_bar.close()

    del all_site_upper_lefts
    del all_site_xy_locations
    del reference_subset_geotransforms

    features, responses = _open_temporary_features_responses_data_files(config, n_features, n_responses,
                                                                        read_type='r+')
    features = _resize_munged_features(features, _sample_index, config)
    responses = _resize_munged_responses(responses, _sample_index, config)
    assert features.shape[0] > 0, 'Failed to collect a single sample'
    _log_munged_data_information(features, responses)

    _logger.debug('Shuffle data to avoid fold assignment biases')
    perm = np.random.permutation(features.shape[0])
    features = ooc_functions.permute_array(features, data_core.get_temporary_features_filepath(config), perm)
    responses = ooc_functions.permute_array(responses, data_core.get_temporary_responses_filepath(config), perm)
    del perm

    _logger.debug('Create uniform weights')
    shape = tuple(list(features.shape)[:-1] + [1])
    weights = np.memmap(data_core.get_temporary_weights_filepath(config), dtype=np.float32, mode='w+', shape=shape)
    weights[:, :, :, :] = 1

    _logger.debug('Remove weights for missing responses')
    weights[np.isnan(responses[..., 0])] = 0

    _logger.debug('Remove weights outside loss window')
    if (config.data_build.loss_window_radius != config.data_build.window_radius):
        buf = config.data_build.window_radius - config.data_build.loss_window_radius
        weights[:, :buf, :, -1] = 0
        weights[:, -buf:, :, -1] = 0
        weights[:, :, :buf, -1] = 0
        weights[:, :, -buf:, -1] = 0
    _log_munged_data_information(features, responses, weights)

    _logger.debug('One-hot encode features')
    features, feature_band_types, feature_per_band_encoded_values = ooc_functions.one_hot_encode_array(
        feature_raw_band_types, features, data_core.get_temporary_features_filepath(config))
    _logger.debug('One-hot encode responses')
    responses, response_band_types, response_per_band_encoded_values = ooc_functions.one_hot_encode_array(
        response_raw_band_types, responses, data_core.get_temporary_responses_filepath(config))
    _log_munged_data_information(features, responses, weights)

    _save_built_data_files(features, responses, weights, config)
    del features, responses, weights

    if ('C' in response_raw_band_types):
        assert np.sum(np.array(response_raw_band_types) == 'C') == 1, \
            'Weighting is currently only enabled for one categorical response variable.'
        features, responses, weights = load_built_data_files(config, writeable=True)
        weights = calculate_categorical_weights(responses, weights, config)
        _logger.debug('Delete in order to flush output')
        del features, responses, weights

    _remove_temporary_data_files(config)

    _logger.debug('Store data build config sections')
    _save_built_data_config_sections_to_verify_successful(config)

    features, responses, weights = load_built_data_files(config, writeable=False)
    return features, responses, weights, feature_band_types, response_band_types, feature_per_band_encoded_values, \
        response_per_band_encoded_values
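The size guards above estimate the float32 memmap footprint as n_features * 4 bytes * max_samples * (window_radius * 2)**2. A small, self-contained sketch of that arithmetic with hypothetical numbers (illustrative only, not part of the module):

n_features = 6                      # hypothetical: six raw feature bands
max_samples = 10000                 # hypothetical data_build.max_samples
window_radius = 128                 # hypothetical data_build.window_radius
window_diameter = window_radius * 2

# 6 bands * 4 bytes * 10000 chips * 256*256 pixels ~= 14.6 Gb; this must stay under max_built_data_gb
feature_memmap_size_gb = n_features * 4 * max_samples * window_diameter ** 2 / 1024. ** 3
print('Expected feature memmap size: {:.1f} Gb'.format(feature_memmap_size_gb))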
def build_training_data_from_response_points(
        config: configs.Config, feature_raw_band_types: List[str], response_raw_band_types: List[str]
) -> Tuple[List[np.array], List[np.array], List[np.array], List[str], List[str], List[np.array], List[np.array]]:
    _logger.info('Build training data from response points')
    if (config.data_build.random_seed is not None):
        np.random.seed(config.data_build.random_seed)

    num_features = np.sum([len(feat_type) for feat_type in feature_raw_band_types])

    feature_memmap_size_gb = num_features*4*config.data_build.max_samples * \
        (config.data_build.window_radius*2)**2 / 1024.**3
    assert feature_memmap_size_gb < config.data_build.max_built_data_gb, \
        'Expected feature memmap size: {} Gb, limit: {}'.format(
            feature_memmap_size_gb, config.data_build.max_built_data_gb)

    xy_sample_points_per_site = []
    responses_per_site = []
    reference_subset_geotransforms = []
    for _site in range(0, len(config.raw_files.feature_files)):
        _logger.debug('Run through first response')
        if (config.raw_files.response_files[_site][0].split('.')[-1] in sections.VECTORIZED_FILENAMES):
            assert (len(config.raw_files.response_files[_site]) == 1), \
                'Only 1 vector file per site supported for CNN'

            feature_sets = [gdal.Open(loc_file, gdal.GA_ReadOnly)
                            for loc_file in config.raw_files.feature_files[_site]]
            boundary_set = common_io.get_site_boundary_set(config, _site)

            lower_coord, upper_coord = common_io.get_overlapping_extent_coordinates(
                [feature_sets, [], [bs for bs in [boundary_set] if bs is not None]])
            resolution = np.squeeze(np.array(feature_sets[0].GetGeoTransform()))[np.array([1, 5])]

            vector = fiona.open(config.raw_files.response_files[_site][0])
            x_sample_points = []
            y_sample_points = []
            band_responses = []
            for _sample in tqdm(range(len(vector)), ncols=80, desc='Reading responses'):
                if (vector[_sample]['geometry']['type'] == 'Point'):
                    x_sample_points.append(vector[_sample]['geometry']['coordinates'][0])
                    y_sample_points.append(vector[_sample]['geometry']['coordinates'][1])
                    band_responses.append(
                        vector[_sample]['properties'][config.raw_files.response_vector_property_name])

                    # Drop points whose full window would fall outside the overlapping extent
                    if (x_sample_points[-1] < lower_coord[0] + resolution[0]*config.data_build.window_radius or
                            x_sample_points[-1] > upper_coord[0] - resolution[0]*config.data_build.window_radius or
                            y_sample_points[-1] < upper_coord[1] + resolution[1]*config.data_build.window_radius or
                            y_sample_points[-1] > lower_coord[1] - resolution[1]*config.data_build.window_radius):
                        x_sample_points.pop(-1)
                        y_sample_points.pop(-1)
                        band_responses.pop(-1)
                else:
                    _logger.info('WARNING: Non-point geometry ignored in file {}'.format(
                        config.raw_files.response_files[_site][0]))
            vector.close()

            xy_sample_points = np.vstack([np.array(x_sample_points), np.array(y_sample_points)]).T
            responses_per_file = np.array(np.array(band_responses).reshape(-1, 1))
            reference_subset_geotransforms.append(list(feature_sets[0].GetGeoTransform()))
        else:
            # TODO: Needs completion
            assert False, 'Mode not yet supported'

        xy_sample_points_per_site.append(xy_sample_points)
        responses_per_site.append(responses_per_file)

    total_samples = sum([site_responses.shape[0] for site_responses in responses_per_site])
    _logger.debug('Found {} total samples across {} sites'.format(total_samples, len(responses_per_site)))
    assert total_samples > 0, 'need at least 1 valid sample...'

    if total_samples > config.data_build.max_samples:
        _logger.debug('Discard samples because the number of valid samples ({}) exceeds the max samples requested ({})'
                      .format(total_samples, config.data_build.max_samples))

        prop_samples_kept_per_site = config.data_build.max_samples / total_samples
        for _site in range(len(responses_per_site)):
            num_samples = len(responses_per_site[_site])
            num_samples_kept = int(prop_samples_kept_per_site * num_samples)

            idxs_kept = np.random.permutation(num_samples)[:num_samples_kept]
            responses_per_site[_site] = responses_per_site[_site][idxs_kept]

            xy_sample_points_per_site[_site] = xy_sample_points_per_site[_site][idxs_kept, :]
            _logger.debug('Site {} had {} valid samples, kept {} samples'.format(
                _site, num_samples, num_samples_kept))

        total_samples = sum([site_responses.shape[0] for site_responses in responses_per_site])
        _logger.debug('Kept {} total samples across {} sites after discarding'.format(
            total_samples, len(responses_per_site)))

    unique_responses = []
    for _site in range(len(responses_per_site)):
        un_site = np.unique(responses_per_site[_site]).tolist()
        unique_responses.extend(un_site)
    unique_responses = np.unique(unique_responses)

    for _response in range(len(unique_responses)):
        _logger.info('Response: {} encoded as: {}'.format(unique_responses[_response], _response))

    for _site in range(len(responses_per_site)):
        revised_responses = np.zeros(responses_per_site[_site].shape)
        for _response in range(len(unique_responses)):
            revised_responses[responses_per_site[_site] == unique_responses[_response]] = _response
        responses_per_site[_site] = revised_responses.copy()

    features = np.memmap(
        data_core.get_temporary_features_filepath(config), dtype=np.float32, mode='w+',
        shape=(total_samples, 2*config.data_build.window_radius, 2*config.data_build.window_radius, num_features)
    )

    sample_index = 0
    for _site in range(0, len(config.raw_files.feature_files)):
        _logger.debug('Open feature and response datasets for site {}'.format(_site))
        feature_sets = [gdal.Open(loc_file, gdal.GA_ReadOnly) for loc_file in config.raw_files.feature_files[_site]]
        boundary_set = common_io.get_site_boundary_set(config, _site)

        _logger.debug('Calculate interior rectangle location and extent')
        lower_coord, upper_coord = common_io.get_overlapping_extent_coordinates(
            [feature_sets, [], [bs for bs in [boundary_set] if bs is not None]])
        resolution = np.squeeze(np.array(feature_sets[0].GetGeoTransform()))[np.array([1, 5])]

        # xy_sample_locations is currently the response centers, but we need to use the pixel ULs. So subtract
        # out the corresponding feature radius
        # xy_sample_locations = (xy_sample_points_per_site[_site] - np.array(lower_coord))*resolution - \
        #     config.data_build.window_radius
        xy_sample_locations = xy_sample_points_per_site[_site]

        ref_trans = feature_sets[0].GetGeoTransform()
        subset_geotransform = None
        if config.raw_files.boundary_files is not None:
            if config.raw_files.boundary_files[_site] is not None and \
                    _is_vector_file(config.raw_files.boundary_files[_site]):
                subset_geotransform = [ref_trans[0], ref_trans[1], 0, ref_trans[3], 0, ref_trans[5]]

        good_response_data = np.zeros(responses_per_site[_site].shape[0]).astype(bool)

        # Now read in features
        for _cr in tqdm(range(len(xy_sample_locations)), ncols=80, desc='Reading features'):

            local_xy_px_locations = []
            if (boundary_set is not None):
                gt = boundary_set.GetGeoTransform()
                local_xy_px_locations.append([int((xy_sample_locations[_cr, 0] - gt[0])/gt[1]),
                                              int((xy_sample_locations[_cr, 1] - gt[3])/gt[5])])
            else:
                local_xy_px_locations.append([0, 0])  # doesn't matter the value, won't get used

            for _file in range(len(feature_sets)):
                gt = feature_sets[_file].GetGeoTransform()
                local_xy_px_locations.append([int((xy_sample_locations[_cr, 0] - gt[0])/gt[1]),
                                              int((xy_sample_locations[_cr, 1] - gt[3])/gt[5])])

            local_feature = read_labeling_chunk(_site, local_xy_px_locations, config,
                                                reference_subset_geotransforms[_site])

            # Make sure that the feature space also has data - the fact that the response space had valid data is no
            # guarantee that the feature space does.
            if local_feature is not None:
                _logger.debug('Save sample data; {} samples saved total'.format(sample_index + 1))
                features[sample_index, ...] = local_feature.copy()
                good_response_data[_cr] = True
                sample_index += 1

        responses_per_site[_site] = responses_per_site[_site][good_response_data]
        _logger.debug('{} samples saved for site {}'.format(responses_per_site[_site].shape[0], _site))

    assert sample_index > 0, \
        'Insufficient feature data corresponding to response data. Consider increasing maximum feature nodata size'

    # transform responses
    responses_raw = np.vstack(responses_per_site).astype(np.float32)
    del responses_per_site

    responses = np.memmap(
        data_core.get_temporary_responses_filepath(config), dtype=np.float32, mode='w+', shape=(sample_index, 1)
    )
    responses[:] = responses_raw[:]
    del responses_raw
    _log_munged_data_information(features, responses)

    features = _resize_munged_features(features, sample_index, config)
    _log_munged_data_information(features, responses)

    _logger.debug('Shuffle data to avoid fold assignment biases')
    perm = np.random.permutation(features.shape[0])
    features = ooc_functions.permute_array(features, data_core.get_temporary_features_filepath(config), perm)
    responses = ooc_functions.permute_array(responses, data_core.get_temporary_responses_filepath(config), perm)
    del perm

    weights = np.ones((responses.shape[0], 1))
    _log_munged_data_information(features, responses, weights)

    # one hot encode
    features, feature_band_types, feature_per_band_encoded_values = ooc_functions.one_hot_encode_array(
        feature_raw_band_types, features, data_core.get_temporary_features_filepath(config))
    responses, response_band_types, response_per_band_encoded_values = ooc_functions.one_hot_encode_array(
        response_raw_band_types, responses, data_core.get_temporary_responses_filepath(config))
    _log_munged_data_information(features, responses, weights)

    # This change happens to work in this instance, but will not work with all sampling strategies. I will leave for
    # now and refactor down the line as necessary. Generally speaking, fold assignments are specific to the style of
    # data read.
    _save_built_data_files(features, responses, weights, config)
    del features, responses, weights

    if 'C' in response_raw_band_types:
        assert np.sum(np.array(response_raw_band_types) == 'C') == 1, \
            'Weighting is currently only enabled for one categorical response variable.'
        features, responses, weights = load_built_data_files(config, writeable=True)
        weights = calculate_categorical_weights(responses, weights, config)
        del features, responses, weights

    _remove_temporary_data_files(config)
    _save_built_data_config_sections_to_verify_successful(config)
    features, responses, weights = load_built_data_files(config, writeable=False)
    return features, responses, weights, feature_band_types, response_band_types, feature_per_band_encoded_values, \
        response_per_band_encoded_values
def get_proj(fname: str) -> str:
    """ Get the projection of a raster/vector dataset.
    :param str fname: Name of input file
    :return: The projection of the input fname
    """
    ds = common_io.noerror_open(fname)
    if (ds is not None):
        proj = ds.GetProjection()
        if (proj is not None):
            srs = osr.SpatialReference(wkt=str(proj))
            if srs.IsProjected() is False:
                exception_str = 'File {} is not projected. Correct or set ignore_projections flag.'.format(fname)
                raise Exception(exception_str)
            return srs.GetAttrValue('projcs')

    if (os.path.basename(fname).split('.')[-1] == 'shp'):
        vset = ogr.GetDriverByName('ESRI Shapefile').Open(fname, gdal.GA_ReadOnly)
    elif (os.path.basename(fname).split('.')[-1] == 'kml'):
        vset = ogr.GetDriverByName('KML').Open(fname, gdal.GA_ReadOnly)
    else:
        exception_str = 'Cannot find projection from file {}'.format(fname)
        raise Exception(exception_str)

    if (vset is None):
        exception_str = 'Cannot find projection from file {}'.format(fname)
        raise Exception(exception_str)
    else:
        proj = vset.GetLayer().GetSpatialRef()
        if (proj is None):
            exception_str = 'Cannot find projection from file {}'.format(fname)
            raise Exception(exception_str)
        else:
            srs = osr.SpatialReference(wkt=str(proj))
            if srs.IsProjected() is False:
                exception_str = 'File {} is not projected. Correct or set ignore_projections flag.'.format(fname)
                raise Exception(exception_str)
            return srs.GetAttrValue('projcs')
def check_projections(f_files: List[List[str]], r_files: List[List[str]], b_files: List[str] = None) -> List[str]:
    # f = feature, r = response, b = boundary
    errors = []

    if (b_files is None):
        b_files = []

    site_f_proj = []
    site_r_proj = []
    site_b_proj = []
    for _site in range(len(f_files)):
        f_proj = []
        r_proj = []
        for _file in range(len(f_files[_site])):
            f_proj.append(get_proj(f_files[_site][_file]))
        for _file in range(len(r_files[_site])):
            r_proj.append(get_proj(r_files[_site][_file]))

        b_proj = None
        if (len(b_files) > 0):
            b_proj = get_proj(b_files[_site])

        un_f_proj = np.unique(f_proj)
        if (len(un_f_proj) > 1):
            errors.append('Feature projection mismatch at site {}, projections: {}'.format(_site, un_f_proj))

        un_r_proj = np.unique(r_proj)
        if (len(un_r_proj) > 1):
            errors.append('Response projection mismatch at site {}, projections: {}'.format(_site, un_r_proj))

        if (un_f_proj[0] != un_r_proj[0]):
            errors.append('Feature/Response projection mismatch at site {}\nFeature proj: {}\nResponse proj: {}'.format(
                _site, un_f_proj[0], un_r_proj[0]))

        if (b_proj is not None):
            if (un_f_proj[0] != b_proj):
                errors.append('Feature/Boundary projection mismatch at site {}\nFeature proj: {}\nBoundary proj: {}'.format(
                    _site, un_f_proj[0], b_proj))
            if (un_r_proj[0] != b_proj):
                errors.append('Response/Boundary projection mismatch at site {}\nResponse proj: {}\nBoundary proj: {}'.format(
                    _site, un_r_proj[0], b_proj))

        site_f_proj.append(un_f_proj[0])
        site_r_proj.append(un_r_proj[0])
        if (b_proj is not None):
            site_b_proj.append(b_proj)

    if (len(np.unique(site_f_proj)) > 1):
        errors.append('Warning, different projections at different feature sites: {}'.format(site_f_proj))
    if (len(np.unique(site_r_proj)) > 1):
        errors.append('Warning, different projections at different response sites: {}'.format(site_r_proj))
    if (len(np.unique(site_b_proj)) > 1):
        errors.append('Warning, different projections at different boundary sites: {}'.format(site_b_proj))
    return errors
def check_resolutions(f_files: List[List[str]], r_files: List[List[str]], b_files: List[str] = None) -> List[str]:
    # f = feature, r = response, b = boundary
    errors = []

    if (b_files is None):
        b_files = []

    site_f_res = []
    site_r_res = []
    site_b_res = []
    for _site in range(len(f_files)):
        f_res = []
        r_res = []
        for _file in range(len(f_files[_site])):
            f_res.append(np.array(gdal.Open(f_files[_site][_file], gdal.GA_ReadOnly).GetGeoTransform())[[1, 5]])
        for _file in range(len(r_files[_site])):
            if (not _is_vector_file(r_files[_site][_file])):
                r_res.append(np.array(gdal.Open(r_files[_site][_file], gdal.GA_ReadOnly).GetGeoTransform())[[1, 5]])

        b_res = None
        if (len(b_files) > 0):
            if (not _is_vector_file(b_files[_site])):
                b_res = np.array(gdal.Open(b_files[_site], gdal.GA_ReadOnly).GetGeoTransform())[[1, 5]]

        un_f_res = []
        if (len(f_res) > 0):
            f_res = np.vstack(f_res)
            un_f_res = [np.unique(f_res[:, 0]), np.unique(f_res[:, 1])]
            if (len(un_f_res[0]) > 1 or len(un_f_res[1]) > 1):
                errors.append('Feature resolution mismatch at site {}, resolutions: {}'.format(_site, un_f_res))

        un_r_res = []
        if (len(r_res) > 0):
            r_res = np.vstack(r_res)
            un_r_res = [np.unique(r_res[:, 0]), np.unique(r_res[:, 1])]
            if (len(un_r_res[0]) > 1 or len(un_r_res[1]) > 1):
                errors.append('Response resolution mismatch at site {}, resolutions: {}'.format(_site, un_r_res))

        if (b_res is not None):
            if (un_f_res[0][0] != b_res[0] or un_f_res[1][0] != b_res[1]):
                errors.append('Feature/Boundary resolution mismatch at site {}'.format(_site))
            if (len(un_r_res) > 0):
                if (un_r_res[0][0] != b_res[0] or un_r_res[1][0] != b_res[1]):
                    errors.append('Response/Boundary resolution mismatch at site {}'.format(_site))

        site_f_res.append(un_f_res[0])
        if (len(un_r_res) > 0):
            site_r_res.append(un_r_res[0])
        if (b_res is not None):
            site_b_res.append(b_res)

    if (len(np.unique(site_f_res)) > 1):
        _logger.info('Warning, different resolutions at different feature sites: {}'.format(site_f_res))
    if (len(np.unique(site_r_res)) > 1):
        _logger.info('Warning, different resolutions at different response sites: {}'.format(site_r_res))
    if (len(np.unique(site_b_res)) > 1):
        _logger.info('Warning, different resolutions at different boundary sites: {}'.format(site_b_res))
    return errors
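A brief usage sketch for the projection and resolution checks above; the file paths are hypothetical placeholders, and the snippet assumes one site with a single feature raster, response raster, and boundary shapefile (illustrative only, not part of the module):

from bfgn.data_management.training_data import check_projections, check_resolutions, get_proj

feature_files = [['/data/site_1/features.tif']]    # hypothetical: one site, one feature raster
response_files = [['/data/site_1/responses.tif']]  # hypothetical response raster
boundary_files = ['/data/site_1/boundary.shp']     # hypothetical boundary vector

print(get_proj(feature_files[0][0]))               # projected CRS name of the feature raster

# Both checks return lists of human-readable error strings; empty lists mean the inputs are consistent
errors = check_projections(feature_files, response_files, boundary_files) + \
    check_resolutions(feature_files, response_files, boundary_files)
assert not errors, '\n'.join(errors)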
# Calculates categorical weights for a single response
def calculate_categorical_weights(
        responses: List[np.array], weights: List[np.array], config: configs.Config, batch_size: int = 100
) -> List[np.array]:

    # find upper and lower bound
    lb = config.data_build.window_radius - config.data_build.loss_window_radius
    ub = -lb

    # get response/total counts (batch-wise)
    response_counts = np.zeros(responses[0].shape[-1])
    total_valid_count = 0
    for idx_array, response_array in enumerate(responses):
        if idx_array in (config.data_build.validation_fold, config.data_build.test_fold):
            continue
        for ind in range(0, response_array.shape[0], batch_size):
            if (lb == 0 or len(response_array.shape) < 3):
                lr = response_array[ind:ind+batch_size, ...]
            else:
                lr = response_array[ind:ind+batch_size, lb:ub, lb:ub, ...]
            lr[lr == config.raw_files.response_nodata_value] = np.nan
            total_valid_count += np.sum(np.isfinite(lr))
            for _r in range(0, len(response_counts)):
                response_counts[_r] += np.nansum(lr[..., _r] == 1)

    # assign_weights
    for _array in range(len(responses)):
        for ind in range(0, responses[_array].shape[0], batch_size):

            lr = (responses[_array])[ind:ind+batch_size, ...]

            lrs = list(lr.shape)
            lrs.pop(-1)
            lw = np.zeros((lrs))
            for _r in range(0, len(response_counts)):
                lw[lr[..., _r] == 1] = total_valid_count / response_counts[_r]

            if (lb != 0 and len(lw.shape) == 3):
                lw[:, :lb, :] = 0
                lw[:, ub:, :] = 0
                lw[:, :, :lb] = 0
                lw[:, :, ub:] = 0

            lws = list(lw.shape)
            lws.extend([1])
            lw = lw.reshape(lws)
            weights[_array][ind:ind+batch_size, ...] = lw

    return weights
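calculate_categorical_weights assigns inverse class-frequency weights: every pixel belonging to a one-hot category receives total_valid_count / response_counts[category], so rare categories are up-weighted. A toy, in-memory illustration of that arithmetic on hypothetical data (the real function operates batch-wise on the memmapped folds):

import numpy as np

# Hypothetical one-hot responses: 4 samples, 2x2 windows, 3 categories
responses = np.zeros((4, 2, 2, 3), dtype=np.float32)
responses[..., 0] = 1                                # mostly category 0 ...
responses[0, 0, 0, :] = [0, 1, 0]                    # ... plus a single category-1 pixel

total_valid_count = np.sum(np.isfinite(responses))   # 48 finite values in total
response_counts = np.array([np.nansum(responses[..., _r] == 1) for _r in range(responses.shape[-1])])

weights = np.zeros(responses.shape[:-1])
for _r in range(len(response_counts)):
    if response_counts[_r] > 0:
        weights[responses[..., _r] == 1] = total_valid_count / response_counts[_r]

print(response_counts)     # counts per category: 15, 1, 0
print(np.unique(weights))  # common category 0 -> 3.2, rare category 1 -> 48.0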
def read_labeling_chunk(_site: int,
                        offset_from_ul: List[List[int]],
                        config: configs.Config,
                        reference_geotransform: List[float]
                        ) -> np.array:
    window_diameter = config.data_build.window_radius * 2

    geotransform = copy.deepcopy(reference_geotransform)
    geotransform[0] += offset_from_ul[0][0] * geotransform[1]
    geotransform[3] += offset_from_ul[0][1] * geotransform[5]

    # for _f in range(len(offset_from_ul)):
    #     print(f_ul[_f])
    #     if (np.any(np.array(offset_from_ul[_f]) < config.data_build.window_radius)):
    #         _logger.debug('Feature read OOB')
    #         return None
    #     if (f_ul[_f][0] > f_sets[_f].RasterXSize - config.data_build.window_radius):
    #         _logger.debug('Feature read OOB')
    #         return None
    #     if (f_ul[_f][1] > f_sets[_f].RasterYSize - config.data_build.window_radius):
    #         _logger.debug('Feature read OOB')
    #         return None

    mask = common_io.read_mask_chunk(_site, offset_from_ul.pop(0), window_diameter, geotransform, config)

    # Return None (not False) so the caller's `is not None` check behaves consistently
    if not _check_mask_data_sufficient(mask, config.data_build.feature_nodata_maximum_fraction):
        return None

    local_feature, mask = common_io.read_map_subset(config.raw_files.feature_files[_site],
                                                    offset_from_ul, window_diameter, mask,
                                                    config.raw_files.feature_nodata_value)

    if not _check_mask_data_sufficient(mask, config.data_build.feature_nodata_maximum_fraction):
        _logger.debug('Insufficient feature data')
        return None

    # Final check (propagate mask forward), and return
    local_feature[mask, :] = np.nan
    return local_feature
def read_segmentation_chunk(_site: int,
                            all_file_upper_lefts: List[List[int]],
                            offset_from_ul: List[int],
                            config: configs.Config,
                            reference_geotransform: List[float],
                            sample_index: int) -> bool:
    window_diameter = config.data_build.window_radius * 2

    [f_ul, r_ul, [b_ul]] = copy.deepcopy(all_file_upper_lefts)

    geotransform = copy.deepcopy(reference_geotransform)
    geotransform[0] += offset_from_ul[0] * geotransform[1]
    geotransform[3] += offset_from_ul[1] * geotransform[5]

    f_ul += offset_from_ul
    for _r in range(len(r_ul)):
        if r_ul[_r] is not None:
            r_ul[_r] += offset_from_ul
    if (b_ul is not None):
        b_ul += offset_from_ul

    mask = common_io.read_mask_chunk(_site, b_ul, window_diameter, geotransform, config)

    if not _check_mask_data_sufficient(mask, config.data_build.feature_nodata_maximum_fraction):
        return False

    response_mask = mask.copy()
    local_response, response_mask = common_io.read_map_subset(config.raw_files.response_files[_site],
                                                              r_ul, window_diameter, response_mask,
                                                              config.raw_files.response_nodata_value,
                                                              lower_bound=config.data_build.response_min_value,
                                                              upper_bound=config.data_build.response_max_value,
                                                              reference_geotransform=geotransform)

    if not _check_mask_data_sufficient(response_mask, config.data_build.response_nodata_maximum_fraction):
        return False

    feature_mask = mask.copy()
    local_feature, feature_mask = common_io.read_map_subset(config.raw_files.feature_files[_site],
                                                            f_ul, window_diameter, feature_mask,
                                                            config.raw_files.feature_nodata_value)

    if not _check_mask_data_sufficient(feature_mask, config.data_build.feature_nodata_maximum_fraction):
        return False

    if config.data_build.response_background_values:
        if np.all(np.in1d(local_response, config.data_build.response_background_values)):
            return False

    # Final check (propagate mask forward), and return
    local_feature[feature_mask, :] = np.nan
    local_response[response_mask, :] = np.nan

    features, responses = _open_temporary_features_responses_data_files(
        config, local_feature.shape[-1], local_response.shape[-1], read_type='r+')
    features[sample_index, ...] = local_feature
    responses[sample_index, ...] = local_response
    del features, responses
    return True
################### File/Dataset Opening Functions ##############################


def _open_temporary_features_responses_data_files(config: configs.Config, num_features: int, num_responses: int,
                                                  read_type: str = 'w+') -> Tuple[np.array, np.array]:
    basename = data_core.get_memmap_basename(config)
    shape = [config.data_build.max_samples, config.data_build.window_radius * 2, config.data_build.window_radius * 2]
    shape_features = tuple(shape + [num_features])
    shape_responses = tuple(shape + [num_responses])

    features_filepath = data_core.get_temporary_features_filepath(config)
    responses_filepath = data_core.get_temporary_responses_filepath(config)

    _logger.debug('Create temporary munged features data file with shape {} at {}'.format(
        shape_features, features_filepath))
    features = np.memmap(features_filepath, dtype=np.float32, mode=read_type, shape=shape_features)

    _logger.debug('Create temporary munged responses data file with shape {} at {}'.format(
        shape_responses, responses_filepath))
    responses = np.memmap(responses_filepath, dtype=np.float32, mode=read_type, shape=shape_responses)

    # features_dataset = h5py.File(features_filepath, read_type)
    # responses_dataset = h5py.File(responses_filepath, read_type)
    # if (read_type == 'w'):
    #     features = features_dataset.create_dataset('features', shape_features, dtype=np.float32, chunks=True)
    #     responses = responses_dataset.create_dataset('responses', shape_responses, dtype=np.float32, chunks=True)
    # else:
    #     features = features_dataset['features']
    #     responses = responses_dataset['responses']

    return features, responses


def _create_temporary_weights_data_files(config: configs.Config, num_samples: int) -> np.array:
    weights_filepath = data_core.get_temporary_weights_filepath(config)
    _logger.debug('Create temporary munged weights data file at {}'.format(weights_filepath))
    shape = tuple([num_samples, config.data_build.window_radius * 2, config.data_build.window_radius * 2, 1])
    return np.memmap(weights_filepath, dtype=np.float32, mode='w+', shape=shape)
def load_built_data_files(config: configs.Config, writeable: bool = False) \
        -> Tuple[List[np.array], List[np.array], List[np.array]]:
    _logger.debug('Loading built data files with writeable == {}'.format(writeable))
    feature_files = data_core.get_built_features_filepaths(config)
    response_files = data_core.get_built_responses_filepaths(config)
    weight_files = data_core.get_built_weights_filepaths(config)

    mode = 'r+' if writeable else 'r'
    features = [np.load(feature_file, mmap_mode=mode) for feature_file in feature_files]
    responses = [np.load(response_file, mmap_mode=mode) for response_file in response_files]
    weights = [np.load(weight_file, mmap_mode=mode) for weight_file in weight_files]
    _logger.debug('Built data files loaded')
    return features, responses, weights
################### Save/remove Functions ##############################


def _save_built_data_files(
        features_munged: np.array, responses_munged: np.array, weights_munged: np.array, config: configs.Config
) -> None:
    _logger.debug('Create fold assignments')
    fold_assignments = np.zeros(features_munged.shape[0]).astype(int)
    for f in range(0, config.data_build.number_folds):
        idx_start = int(round(f / config.data_build.number_folds * len(fold_assignments)))
        idx_finish = int(round((f + 1) / config.data_build.number_folds * len(fold_assignments)))
        fold_assignments[idx_start:idx_finish] = f

    _logger.debug('Save features to memmapped arrays separated by folds')
    features_filepaths = data_core.get_built_features_filepaths(config)
    for idx_fold in range(config.data_build.number_folds):
        _logger.debug('Save features fold {}'.format(idx_fold))
        np.save(features_filepaths[idx_fold], features_munged[fold_assignments == idx_fold, ...])

    _logger.debug('Save responses to memmapped arrays separated by folds')
    responses_filepaths = data_core.get_built_responses_filepaths(config)
    for idx_fold in range(config.data_build.number_folds):
        _logger.debug('Save responses fold {}'.format(idx_fold))
        np.save(responses_filepaths[idx_fold], responses_munged[fold_assignments == idx_fold, ...])

    _logger.debug('Save weights to memmapped arrays separated by folds')
    weights_filepaths = data_core.get_built_weights_filepaths(config)
    for idx_fold in range(config.data_build.number_folds):
        _logger.debug('Save weights fold {}'.format(idx_fold))
        np.save(weights_filepaths[idx_fold], weights_munged[fold_assignments == idx_fold, ...])


def _save_built_data_config_sections_to_verify_successful(config: configs.Config) -> None:
    filepath = data_core.get_built_data_config_filepath(config)
    _logger.debug('Saving built data config sections to {}'.format(filepath))
    configs.save_config_to_file(config, filepath, include_sections=['raw_files', 'data_build'])


def _remove_temporary_data_files(config: configs.Config) -> None:
    _logger.debug('Remove temporary munge files')
    if os.path.exists(data_core.get_temporary_features_filepath(config)):
        os.remove(data_core.get_temporary_features_filepath(config))
    if os.path.exists(data_core.get_temporary_responses_filepath(config)):
        os.remove(data_core.get_temporary_responses_filepath(config))
    if os.path.exists(data_core.get_temporary_weights_filepath(config)):
        os.remove(data_core.get_temporary_weights_filepath(config))


################### Array Resizing Functions ##############################


def _resize_munged_features(features_munged: np.array, num_samples: int, config: configs.Config) -> np.array:
    _logger.debug('Resize memmapped features array with out-of-memory methods; original features shape {}'.format(
        features_munged.shape))
    new_features_shape = tuple([num_samples] + list(features_munged.shape[1:]))
    _logger.debug('Delete in-memory data to force data dump')
    del features_munged
    _logger.debug('Reload data from memmap files with modified sizes; new features shape {}'.format(
        new_features_shape))
    features_munged = np.memmap(
        data_core.get_temporary_features_filepath(config), dtype=np.float32, mode='r+', shape=new_features_shape)
    return features_munged


def _resize_munged_responses(responses_munged: np.array, num_samples: int, config: configs.Config) -> np.array:
    _logger.debug('Resize memmapped responses array with out-of-memory methods; original responses shape {}'.format(
        responses_munged.shape))
    new_responses_shape = tuple([num_samples] + list(responses_munged.shape[1:]))
    _logger.debug('Delete in-memory data to force data dump')
    del responses_munged
    _logger.debug('Reload data from memmap files with modified sizes; new responses shape {}'.format(
        new_responses_shape))
    responses_munged = np.memmap(
        data_core.get_temporary_responses_filepath(config), dtype=np.float32, mode='r+', shape=new_responses_shape)
    return responses_munged


################### Verification Functions ##############################


def _check_mask_data_sufficient(mask: np.array, max_nodata_fraction: float) -> bool:
    if mask is not None:
        nodata_fraction = np.sum(mask) / np.prod(mask.shape)
        if nodata_fraction <= max_nodata_fraction:
            _logger.debug('Data mask has sufficient data, missing data proportion: {}'.format(nodata_fraction))
            return True
        else:
            _logger.debug('Data mask has insufficient data, missing data proportion: {}'.format(nodata_fraction))
            return False
    else:
        _logger.debug('Data mask is None')
        return False


def _is_vector_file(boundary_filepath: str) -> bool:
    return str(boundary_filepath.split('.')[-1]).lower() in sections.VECTORIZED_FILENAMES
def check_built_data_files_exist(config: configs.Config) -> bool:
    filepaths = \
        data_core.get_built_features_filepaths(config) + \
        data_core.get_built_responses_filepaths(config) + \
        data_core.get_built_weights_filepaths(config)
    missing_files = [filepath for filepath in filepaths if not os.path.exists(filepath)]
    if not missing_files:
        _logger.debug('Built data files found at paths: {}'.format(', '.join(filepaths)))
    else:
        _logger.info('Built data files were not found at paths: {}'.format(', '.join(missing_files)))
    return not missing_files
################### Logging Functions ##############################


def _log_munged_data_information(
        features_munged: np.array = None, responses_munged: np.array = None, weights_munged: np.array = None
) -> None:
    if features_munged is not None:
        _logger.info('Munged features shape: {}'.format(features_munged.shape))
    if responses_munged is not None:
        _logger.info('Munged responses shape: {}'.format(responses_munged.shape))
    if weights_munged is not None:
        _logger.info('Munged weights shape: {}'.format(weights_munged.shape))
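A short usage sketch for reloading a completed data build; `config` is assumed to be an already-populated configs.Config (hypothetical here), and both helpers are defined above (illustrative only, not part of the module):

from bfgn.data_management import training_data

# config = ...  # an already-populated configs.Config for a finished data build (assumed)
if training_data.check_built_data_files_exist(config):
    features, responses, weights = training_data.load_built_data_files(config, writeable=False)
    # One memmapped array per fold; inspect fold 0
    print(features[0].shape, responses[0].shape, weights[0].shape)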