diff --git a/CHANGELOG.md b/CHANGELOG.md
index 047b2012..490c0dfa 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,10 +9,30 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 ### Added
 
+- Command `fiboa improve` with helpers to
+  - change the CRS
+  - change the GeoParquet version and compression
+  - fill missing perimeter/area values
+  - fix invalid geometries
+  - rename columns
 - Converter for Lithuania (EuroCrops)
-- Converter for Switzerland
 - Converter for Slovenia
 - Converter for Slovakia
+- Converter for Switzerland
+- `fiboa convert`: New parameter `--original-geometries` / `-og` to keep the original geometries
+
+### Changed
+
+- `fiboa convert`:
+  - Writes custom schemas to collection metadata
+  - Geometries are made valid using GeoPandas' `make_valid` method by default
+  - MultiPolygons are converted to Polygons by default
+- `fiboa validate` uses custom schemas for validation
+- `fiboa merge` keeps custom schemas when needed
+
+### Removed
+
+- `fiboa convert`: Removed the explicit parameter `explode_multipolygon` from the converter
 
 ### Fixed
 
diff --git a/README.md b/README.md
index cc9f6cff..31699f62 100644
--- a/README.md
+++ b/README.md
@@ -8,12 +8,12 @@ A command-line interface (CLI) for working with fiboa.
 
 ## Getting Started
 
-In order to make working with fiboa easier we have developed command-line interface (CLI) tools such as 
+In order to make working with fiboa easier, we have developed command-line interface (CLI) tools for tasks such as
 inspection, validation and file format conversions.
 
 ### Installation
 
-You will need to have **Python 3.9** or any later version installed. 
+You will need to have **Python 3.9** or any later version installed.
 
 Run `pip install fiboa-cli` in the CLI to install the validator.
 
@@ -21,6 +21,7 @@ Run `pip install fiboa-cli` in the CLI to install the validator.
 you can for example run: `pip install fiboa-cli[xyz]` with xyz being the converter name.
 
 **Note on versions:**
+
 - fiboa CLI >= 0.3.0 works with fiboa version > 0.2.0
 - fiboa CLI < 0.3.0 works with fiboa version = 0.1.0
 
@@ -44,6 +45,7 @@ fiboa CLI supports various commands to work with the files:
 - [Merge fiboa GeoParquet files](#merge-fiboa-geoparquet-files)
 - [Create JSON Schema from fiboa Schema](#create-json-schema-from-fiboa-schema)
 - [Validate a fiboa Schema](#validate-a-fiboa-schema)
+- [Improve a fiboa Parquet file](#improve-a-fiboa-parquet-file)
 - [Update an extension template with new names](#update-an-extension-template-with-new-names)
 - [Converter for existing datasets](#converter-for-existing-datasets)
 - [Development](#development)
@@ -121,19 +123,38 @@ To validate a fiboa Schema YAML file, you can for example run:
 
 Check `fiboa validate-schema --help` for more details.
 
+### Improve a fiboa Parquet file
+
+Various "improvements" can be applied to a fiboa GeoParquet file.
+The command allows you to
+
+- change the CRS (`--crs`)
+- change the GeoParquet version (`-gp1`) and compression (`-pc`)
+- add/fill missing perimeter/area values (`-sz`)
+- fix invalid geometries (`-g`)
+- rename columns (`-r`)
+
+Example:
+
+- `fiboa improve file.parquet -o file2.parquet -g -sz -r old=new -pc zstd`
+
+Check `fiboa improve --help` for more details.
+
 ### Update an extension template with new names
 
 Once you've created and git cloned a new extension, you can use the CLI
 to update all template placeholders with proper names.
 
 For example, if your extension is meant to have
-- the title "Timestamps Extension", 
+
+- the title "Timestamps Extension",
 - the prefix `ts` (e.g. field `ts:created` or `ts:updated`),
 - is hosted at `https://github.io/fiboa/timestamps-extension`
   (organization: `fiboa`, repository `timestamps-extension`),
 - and you run fiboa in the folder of the extension.
 
 Then the following command could be used:
+
 - `fiboa rename-extension . -t Timestamps -p ts -s timestamps-extension -o fiboa`
 
 Check `fiboa rename-extension --help` for more details.
 
@@ -143,13 +164,14 @@ Check `fiboa rename-extension --help` for more details.
 
 The CLI ships various converters for existing datasets.
 
 To get a list of available converters/datasets with title, license, etc. run:
+
 - `fiboa converters`
 
 Use any of the IDs from the list to convert an existing dataset to fiboa:
 
 - `fiboa convert de_nrw`
 
-See [Implement a converter](#implement-a-converter) for details about how to 
+See [Implement a converter](#implement-a-converter) for details about how to implement a converter.
 
 ## Development
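
> **Editor's note:** The new `improve` helpers are also callable from Python. A minimal sketch, assuming the `improve` signature introduced in `fiboa_cli/improve.py` further down in this patch; the file paths and the renamed column are placeholders:

```python
# Library-level equivalent of:
#   fiboa improve file.parquet -o file2.parquet -g -sz -r old=new -pc zstd
# A sketch only; paths and column names are made up.
from fiboa_cli.improve import improve

improve(
    "file.parquet",
    out="file2.parquet",            # omit to overwrite the input file
    rename_columns={"old": "new"},  # -r old=new
    add_sizes=True,                 # -sz
    fix_geometries=True,            # -g
    compression="zstd",             # -pc zstd
)
```
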
""" log(f"fiboa CLI {__version__} - Convert '{dataset}'\n", "success") try: - convert_(dataset, out, input, cache, source_coop, collection, compression, geoparquet1, mapping_file) + convert_(dataset, out, input, cache, source_coop, collection, compression, geoparquet1, mapping_file, original_geometries) except Exception as e: log(e, "error") sys.exit(1) @@ -518,7 +527,7 @@ def rename_extension(folder, title, slug, org = "fiboa", prefix = None): multiple=True, help='Additional column names to include.', show_default=True, - default=DEFAULT_COLUMNS, + default=CORE_COLUMNS, ) @click.option( '--exclude', '-e', @@ -536,7 +545,7 @@ def rename_extension(folder, title, slug, org = "fiboa", prefix = None): ) @click.option( '--compression', '-pc', - type=click.Choice(["brotli", "gzip", "lz4", "snappy", "zstd", "none"]), + type=click.Choice(COMPRESSION_METHODS), help='Compression method for the Parquet file.', show_default=True, default="brotli" @@ -545,7 +554,7 @@ def rename_extension(folder, title, slug, org = "fiboa", prefix = None): '--geoparquet1', '-gp1', is_flag=True, type=click.BOOL, - help='Enforces generating a GeoParquet 1.0 file bounding box. Defaults to GeoParquet 1.1 with bounding box.', + help='Enforces generating a GeoParquet 1.0 file. Defaults to GeoParquet 1.1 with bounding box.', default=False ) def merge(datasets, out, crs, include, exclude, extension, compression, geoparquet1): @@ -564,6 +573,76 @@ def merge(datasets, out, crs, include, exclude, extension, compression, geoparqu sys.exit(1) +## IMPROVE (add area, perimeter, and fix geometries) +@click.command() +@click.argument('input', nargs=1, type=click.Path(exists=True)) +@click.option( + '--out', '-o', + type=click.Path(exists=False), + help='Path to write the GeoParquet file to. If not given, overwrites the input file.', + default=None +) +@click.option( + '--rename-column', '-r', + type=click.STRING, + callback=lambda ctx, param, value: parse_map(value), + multiple=True, + help='Renaming of columns. Provide the old name and the new name separated by an equal sign. Can be used multiple times.' +) +@click.option( + '--add-sizes', '-sz', + is_flag=True, + type=click.BOOL, + help='Computes missing sizes (area, perimeter)', + default=False +) +@click.option( + '--fix-geometries', '-g', + is_flag=True, + type=click.BOOL, + help='Tries to fix invalid geometries that are repored by the validator (uses GeoPanda\'s make_valid method internally)', + default=False +) +@click.option( + '--explode-geometries', '-e', + is_flag=True, + type=click.BOOL, + help='Converts MultiPolygons to Polygons', + default=False +) +@click.option( + '--crs', + type=click.STRING, + help='Coordinate Reference System (CRS) to use for the GeoParquet file.', + show_default=True, + default=None, +) +@click.option( + '--compression', '-pc', + type=click.Choice(COMPRESSION_METHODS), + help='Compression method for the Parquet file.', + show_default=True, + default="brotli" +) +@click.option( + '--geoparquet1', '-gp1', + is_flag=True, + type=click.BOOL, + help='Enforces generating a GeoParquet 1.0 file. Defaults to GeoParquet 1.1 with bounding box.', + default=False +) +def improve(input, out, rename_column, add_sizes, fix_geometries, explode_geometries, crs, compression, geoparquet1): + """ + "Improves" a fiboa GeoParquet file according to the given parameters. 
+ """ + log(f"fiboa CLI {__version__} - Improve datasets\n", "success") + try: + improve_(input, out, rename_column, add_sizes, fix_geometries, explode_geometries, crs, compression, geoparquet1) + except Exception as e: + log(e, "error") + sys.exit(1) + + cli.add_command(describe) cli.add_command(validate) cli.add_command(validate_schema) @@ -574,6 +653,7 @@ def merge(datasets, out, crs, include, exclude, extension, compression, geoparqu cli.add_command(converters) cli.add_command(rename_extension) cli.add_command(merge) +cli.add_command(improve) if __name__ == '__main__': cli() diff --git a/fiboa_cli/const.py b/fiboa_cli/const.py index 4df8d47e..8f54bd14 100644 --- a/fiboa_cli/const.py +++ b/fiboa_cli/const.py @@ -10,3 +10,14 @@ STAC_COLLECTION_SCHEMA = "http://schemas.stacspec.org/v{version}/collection-spec/json-schema/collection.json" GEOPARQUET_SCHEMA = "https://geoparquet.org/releases/v{version}/schema.json" STAC_TABLE_EXTENSION = "https://stac-extensions.github.io/table/v1.2.0/schema.json" + +COMPRESSION_METHODS = ["brotli", "gzip", "lz4", "snappy", "zstd", "none"] + +CORE_COLUMNS = [ + "id", + "geometry", + "area", + "perimeter", + "determination_datetime", + "determination_method", +] diff --git a/fiboa_cli/convert.py b/fiboa_cli/convert.py index 84030b75..8f006ff5 100644 --- a/fiboa_cli/convert.py +++ b/fiboa_cli/convert.py @@ -13,7 +13,8 @@ def convert( collection = False, compression = None, geoparquet1 = False, - mapping_file=None, + mapping_file = None, + original_geometries = False, ): if dataset in IGNORED_DATASET_FILES: raise Exception(f"'{dataset}' is not a converter") @@ -37,6 +38,7 @@ def convert( compression = compression, geoparquet1 = geoparquet1, mapping_file = mapping_file, + original_geometries = original_geometries, ) def list_all_converter_ids(): diff --git a/fiboa_cli/convert_utils.py b/fiboa_cli/convert_utils.py index cec0c1fb..74865220 100644 --- a/fiboa_cli/convert_utils.py +++ b/fiboa_cli/convert_utils.py @@ -40,7 +40,7 @@ def convert( license = None, compression = None, geoparquet1 = False, - explode_multipolygon = False, + original_geometries = False, index_as_id = False, **kwargs): """ @@ -160,11 +160,12 @@ def convert( else: log(f"Column '{key}' not found in dataset, skipping migration", "warning") - # 4b. For geometry column, convert multipolygon type to polygon - if explode_multipolygon: + # 4b. 
+    # 4b. For geometry column, fix geometries
+    if not original_geometries:
+        gdf.geometry = gdf.geometry.make_valid()
         gdf = gdf.explode()
 
-    if has_migration or has_col_migrations or has_col_filters or has_col_additions or explode_multipolygon:
+    if has_migration or has_col_migrations or has_col_filters or has_col_additions:
         log("GeoDataFrame after migrations and filters:")
         print(gdf.head())
 
diff --git a/fiboa_cli/datasets/be_wa.py b/fiboa_cli/datasets/be_wa.py
index f4013fdc..5e32635c 100644
--- a/fiboa_cli/datasets/be_wa.py
+++ b/fiboa_cli/datasets/be_wa.py
@@ -91,7 +91,6 @@ def file_migration(data, path, uri, layer):
         license=LICENSE,
         layer_filter=lambda layer, uri: layer == LAYER,
         file_migration=file_migration,
-        explode_multipolygon=True,
         index_as_id=True,
         **kwargs
     )
diff --git a/fiboa_cli/datasets/ch.py b/fiboa_cli/datasets/ch.py
index 86634e68..e265fbe0 100644
--- a/fiboa_cli/datasets/ch.py
+++ b/fiboa_cli/datasets/ch.py
@@ -69,7 +69,6 @@ def convert(output_file, cache = None, **kwargs):
         column_migrations=COLUMN_MIGRATIONS,
         column_filters=COLUMN_FILTERS,
         providers=PROVIDERS,
-        explode_multipolygon=True,
         index_as_id=True,
         fid_as_index=True,
         **kwargs
diff --git a/fiboa_cli/datasets/ec_fr.py b/fiboa_cli/datasets/ec_fr.py
index 392c5d34..de0ab2e8 100644
--- a/fiboa_cli/datasets/ec_fr.py
+++ b/fiboa_cli/datasets/ec_fr.py
@@ -25,6 +25,5 @@ def convert(output_file, cache = None, **kwargs):
         column_filters=base.COLUMN_FILTERS,
         attribution=base.ATTRIBUTION,
         license=LICENSE,
-        explode_multipolygon=True,
         **kwargs
     )
diff --git a/fiboa_cli/datasets/es_cat.py b/fiboa_cli/datasets/es_cat.py
index 4e1f0fca..327d9199 100644
--- a/fiboa_cli/datasets/es_cat.py
+++ b/fiboa_cli/datasets/es_cat.py
@@ -35,7 +35,6 @@
 
 COLUMN_MIGRATIONS = {
     "campanya": lambda col: pd.to_datetime(col, format='%Y'),
-    "geometry": lambda col: col.make_valid(),
 }
 
 MISSING_SCHEMAS = {
@@ -62,6 +61,5 @@ def convert(output_file, cache = None, **kwargs):
         license=LICENSE,
         layer="CULTIUS_DUN2023",
         index_as_id=True,
-        explode_multipolygon=True,
         **kwargs
     )
diff --git a/fiboa_cli/datasets/fi.py b/fiboa_cli/datasets/fi.py
index 743468b7..edec46c1 100644
--- a/fiboa_cli/datasets/fi.py
+++ b/fiboa_cli/datasets/fi.py
@@ -33,8 +33,6 @@
 COLUMN_MIGRATIONS = {
     # Make year (1st january) from column "VUOSI"
     "VUOSI": lambda col: pd.to_datetime(col, format='%Y'),
-    # Todo: generate a generic solution for making geometries valid
-    "geometry": lambda col: col.make_valid()
 }
 
 def migrate(gdf):
diff --git a/fiboa_cli/datasets/fr.py b/fiboa_cli/datasets/fr.py
index 36c53b82..f448363f 100644
--- a/fiboa_cli/datasets/fr.py
+++ b/fiboa_cli/datasets/fr.py
@@ -62,6 +62,5 @@ def convert(output_file, cache = None, **kwargs):
         column_filters=COLUMN_FILTERS,
         attribution=ATTRIBUTION,
         license=LICENSE,
-        explode_multipolygon=True,
         **kwargs
     )
diff --git a/fiboa_cli/datasets/ie.py b/fiboa_cli/datasets/ie.py
index b903128f..79e2cffd 100644
--- a/fiboa_cli/datasets/ie.py
+++ b/fiboa_cli/datasets/ie.py
@@ -67,6 +67,5 @@ def file_migration(data, path, uri, layer):
         license=LICENSE,
         layer_filter=lambda layer, uri: layer == LAYER,
         file_migration=file_migration,
-        explode_multipolygon=True,
         **kwargs
     )
diff --git a/fiboa_cli/datasets/nl.py b/fiboa_cli/datasets/nl.py
index f63bbefd..a70a62dd 100644
--- a/fiboa_cli/datasets/nl.py
+++ b/fiboa_cli/datasets/nl.py
@@ -80,7 +80,6 @@ def convert(output_file, cache = None, **kwargs):
         migration=migrate,
         attribution=ATTRIBUTION,
         license=LICENSE,
-        explode_multipolygon=True,
         index_as_id=True,
         **kwargs
     )
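
> **Editor's note:** All the per-dataset `explode_multipolygon` flags and `make_valid` column migrations removed above are superseded by the new default in `convert_utils.py`. A minimal standalone sketch (not part of the patch) of what that default does to source geometries:

```python
# make_valid() repairs invalid rings (e.g. self-intersections) and explode()
# yields one Polygon row per MultiPolygon part. Data here is made up.
import geopandas as gpd
from shapely.geometry import Polygon

# A self-intersecting "bowtie" polygon - invalid as provided by many sources
bowtie = Polygon([(0, 0), (2, 2), (2, 0), (0, 2)])
gdf = gpd.GeoDataFrame({"id": [1]}, geometry=[bowtie], crs="EPSG:4326")

gdf.geometry = gdf.geometry.make_valid()  # becomes a valid MultiPolygon
gdf = gdf.explode(index_parts=False)      # one Polygon per row
print(gdf.is_valid.all(), list(gdf.geom_type.unique()))  # True ['Polygon']
```
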
diff --git a/fiboa_cli/datasets/template.py b/fiboa_cli/datasets/template.py
index b9a5ae27..22af22a2 100644
--- a/fiboa_cli/datasets/template.py
+++ b/fiboa_cli/datasets/template.py
@@ -168,7 +168,6 @@ def convert(output_file, cache = None, **kwargs):
         attribution=ATTRIBUTION,
         license=LICENSE,
         # Other options:
-        # explode_multipolygon=True/False, # Converts MultiPolygons to Polygons
         # index_as_id=True/False, # Adds a column "id" with the index of the GeoDataFrame
         **kwargs
     )
diff --git a/fiboa_cli/describe.py b/fiboa_cli/describe.py
index c698f8c8..32649a6e 100644
--- a/fiboa_cli/describe.py
+++ b/fiboa_cli/describe.py
@@ -1,7 +1,7 @@
 import json
 import pandas as pd
 
-from .util import log, load_parquet_data, load_parquet_schema, load_parquet_metadata, parse_metadata, log_extensions
+from .util import log, load_parquet_data, load_parquet_schema, load_parquet_metadata, parse_metadata, log_extensions, is_schema_empty
 
 def describe(file, display_json=False, num=10, columns=None):
     metadata = load_parquet_metadata(file)
@@ -26,6 +26,10 @@ def describe(file, display_json=False, num=10, columns=None):
     if "fiboa_extensions" in collection and isinstance(collection["fiboa_extensions"], list):
         log_extensions(collection, log)
 
+    custom_schemas = collection.get("fiboa_custom_schemas", {})
+    if not is_schema_empty(custom_schemas):
+        log("Custom schemas: " + ", ".join(custom_schemas["properties"].keys()))
+
     if (display_json):
         log(json.dumps(collection, indent=2))
     elif "stac_version" in collection:
diff --git a/fiboa_cli/improve.py b/fiboa_cli/improve.py
new file mode 100644
index 00000000..b9d8efe8
--- /dev/null
+++ b/fiboa_cli/improve.py
@@ -0,0 +1,79 @@
+import os
+
+from .const import CORE_COLUMNS
+from .parquet import create_parquet
+from .util import load_parquet_data, load_parquet_schema, log, parse_metadata, pick_schemas, is_schema_empty
+
+
+def improve(input, out = None, rename_columns = {}, add_sizes = False, fix_geometries = False, explode_geometries = False, crs = None, compression = None, geoparquet1 = False):
+    # Prepare and determine location of the output file
+    if not out:
+        out = input
+    else:
+        dir = os.path.dirname(out)
+        if dir:
+            os.makedirs(dir, exist_ok=True)
+
+    # Load the dataset
+    schema = load_parquet_schema(input)
+    collection = parse_metadata(schema, b"fiboa")
+    columns = list(schema.names)
+    # Remove the bbox column (if present) to avoid conflicts when writing the GeoParquet file later
+    if "bbox" in columns:
+        columns.remove("bbox")
+    gdf = load_parquet_data(input, columns = columns)
+
+    # Change the CRS
+    if crs is not None:
+        gdf.to_crs(crs=crs, inplace=True)
+        log(f"Changed CRS to {crs}", "info")
+
+    # Fix geometries
+    if fix_geometries:
+        gdf.geometry = gdf.geometry.make_valid()
+        log("Fixed geometries", "info")
+
+    # Convert MultiPolygons to Polygons
+    if explode_geometries:
+        gdf = gdf.explode()
+        log("Exploded geometries", "info")
+
+    # Rename columns
+    if len(rename_columns) > 0:
+        for col in rename_columns:
+            columns[columns.index(col)] = rename_columns[col]
+            if col in CORE_COLUMNS:
+                log(f"Column {col} is a fiboa core field - do you really want to rename it?", "warning")
+            if ":" in col:
+                log(f"Column {col} may be a fiboa extension field - do you really want to rename it?", "warning")
+        gdf.rename(columns=rename_columns, inplace=True)
+        log("Renamed columns", "info")
+
+    # Add sizes
+    if add_sizes:
+        # Add the area and perimeter columns
+        for name in ["area", "perimeter"]:
+            if name not in columns:
+                # Create column if not present
+                gdf[name] = None
+                columns.append(name)
+
+        gdf_m = gdf
+        # Determine whether the given CRS is in meters
+        if gdf.crs.axis_info[0].unit_name not in ["m", "metre", "meter"]:
+            # Reproject the geometries to an equal-area projection if needed
+            gdf_m = gdf.to_crs("EPSG:6933")
+
+        # Compute the missing area and perimeter values
+        gdf["area"] = gdf_m["area"].fillna(gdf_m.geometry.area * 0.0001)
+        gdf["perimeter"] = gdf_m["perimeter"].fillna(gdf_m.geometry.length)
+
+    custom_schemas = collection.get("fiboa_custom_schemas", {})
+    custom_schemas = pick_schemas(custom_schemas, columns, rename_columns)
+    if not is_schema_empty(custom_schemas):
+        collection["fiboa_custom_schemas"] = custom_schemas
+
+
+    # Write the improved dataset to the output file
+    create_parquet(gdf, columns, collection, out, {}, compression=compression, geoparquet1=geoparquet1)
+    log(f"Wrote data to {out}", "success")
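
> **Editor's note:** The size computation above, condensed into a standalone sketch. It assumes the `area`/`perimeter` columns already exist (as the code ensures); the `0.0001` factor converts m² to hectares:

```python
# Sketch of the -sz logic: measure in the equal-area EPSG:6933 CRS unless the
# data's CRS unit is already meters, then fill only the missing values.
import geopandas as gpd

def compute_missing_sizes(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    unit = gdf.crs.axis_info[0].unit_name if gdf.crs else None
    gdf_m = gdf if unit in ("m", "metre", "meter") else gdf.to_crs("EPSG:6933")
    gdf["area"] = gdf["area"].fillna(gdf_m.geometry.area * 0.0001)     # ha
    gdf["perimeter"] = gdf["perimeter"].fillna(gdf_m.geometry.length)  # m
    return gdf
```
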
diff --git a/fiboa_cli/merge.py b/fiboa_cli/merge.py
index 4833310e..7e315277 100644
--- a/fiboa_cli/merge.py
+++ b/fiboa_cli/merge.py
@@ -2,18 +2,11 @@
 
 import pandas as pd
 
+from .const import CORE_COLUMNS
 from .parquet import create_parquet
-from .util import load_parquet_data, load_parquet_schema, log, parse_metadata
+from .util import load_parquet_data, load_parquet_schema, log, merge_schemas, parse_metadata, pick_schemas, is_schema_empty
 from .version import fiboa_version
 
-DEFAULT_COLUMNS = [
-    "id",
-    "geometry",
-    "area",
-    "perimeter",
-    "determination_datetime",
-    "determination_method",
-]
 DEFAULT_CRS = "EPSG:4326"
 
 def merge(datasets, out, crs = DEFAULT_CRS, includes = [], excludes = [], extensions = [], compression = None, geoparquet1 = False):
@@ -21,15 +14,17 @@ def merge(datasets, out, crs = DEFAULT_CRS, includes = [], excludes = [], extens
     if dir:
         os.makedirs(dir, exist_ok=True)
 
-    columns = DEFAULT_COLUMNS.copy()
+    columns = CORE_COLUMNS.copy()
     columns.extend(includes)
     columns = list(set(columns) - set(excludes))
 
     # Load the datasets
     all_gdf = []
+    custom_schemas = {}
     for dataset in datasets:
         # Load the dataset
         schema = load_parquet_schema(dataset)
+        collection = parse_metadata(schema, b"fiboa")
         file_columns = list(set(columns) & set(schema.names))
         gdf = load_parquet_data(dataset, columns=file_columns)
 
@@ -37,12 +32,16 @@ def merge(datasets, out, crs = DEFAULT_CRS, includes = [], excludes = [], extens
         gdf.to_crs(crs=crs, inplace=True)
 
         # Add collection column to each dataset
-        collection = parse_metadata(schema, b"fiboa")
         if collection is not None and "id" in collection:
             gdf["collection"] = collection["id"]
         else:
             gdf["collection"] = os.path.splitext(os.path.basename(dataset))[0]
 
+        # Merge custom schemas (collection may be None for non-fiboa files)
+        if collection is not None:
+            custom_schema = collection.get("fiboa_custom_schemas", {})
+            custom_schemas = merge_schemas(custom_schemas, custom_schema)
+
         all_gdf.append(gdf)
 
     merged = pd.concat(all_gdf, ignore_index=True)
@@ -58,6 +56,11 @@ def merge(datasets, out, crs = DEFAULT_CRS, includes = [], excludes = [], extens
         "fiboa_extensions": extensions,
     }
 
+    # Add custom schemas
+    custom_schemas = pick_schemas(custom_schemas, columns)
+    if not is_schema_empty(custom_schemas):
+        collection["fiboa_custom_schemas"] = custom_schemas
+
     # Write the merged dataset to the output file
     create_parquet(merged, columns, collection, out, {}, compression=compression, geoparquet1=geoparquet1)
     log(f"Merged data to {out}", "success")
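
> **Editor's note:** The schema bookkeeping above leans on two helpers added to `fiboa_cli/util.py` below. Based on their definitions, a tiny worked example with made-up property names (the `merge_schemas` body is only partly visible in this patch, so the merged result is inferred from its usage):

```python
# merge_schemas() unions "required" and "properties" across schemas;
# pick_schemas() keeps only the listed properties and can rename them.
from fiboa_cli.util import merge_schemas, pick_schemas

a = {"required": ["foo"], "properties": {"foo": {"type": "string"}}}
b = {"properties": {"bar": {"type": "uint8"}}}

merged = merge_schemas(a, b)
# -> {"required": ["foo"],
#     "properties": {"foo": {"type": "string"}, "bar": {"type": "uint8"}}}

picked = pick_schemas(merged, ["foo"], rename={"foo": "baz"})
# -> {"required": ["baz"], "properties": {"baz": {"type": "string"}}}
```
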
diff --git a/fiboa_cli/parquet.py b/fiboa_cli/parquet.py
index 43e03e19..11cc159a 100644
--- a/fiboa_cli/parquet.py
+++ b/fiboa_cli/parquet.py
@@ -5,7 +5,7 @@
 from shapely.geometry import shape
 
 from .types import get_geopandas_dtype, get_pyarrow_type_for_geopandas, get_pyarrow_field
-from .util import log, load_fiboa_schema, load_file, merge_schemas
+from .util import log, load_fiboa_schema, load_file, merge_schemas, is_schema_empty
 from .geopandas import to_parquet
 
 ROW_GROUP_SIZE = 25000
@@ -15,6 +15,11 @@ def create_parquet(data, columns, collection, output_file, config, missing_schem
     fiboa_schema = load_fiboa_schema(config)
     schemas = merge_schemas(missing_schemas, fiboa_schema)
 
+    # Add the custom schemas to the collection for future use
+    if not is_schema_empty(missing_schemas):
+        collection = collection.copy()
+        collection["fiboa_custom_schemas"] = missing_schemas
+
     # Load all extension schemas
     extensions = {}
     if "fiboa_extensions" in collection and isinstance(collection["fiboa_extensions"], list):
diff --git a/fiboa_cli/util.py b/fiboa_cli/util.py
index 1e9d46a6..fffb990a 100644
--- a/fiboa_cli/util.py
+++ b/fiboa_cli/util.py
@@ -224,6 +224,22 @@ def parse_converter_input_files(ctx, param, value):
     return sources
 
 
+def parse_map(value, separator = "="):
+    if value is None:
+        return {}
+    elif not isinstance(value, tuple):
+        raise click.BadParameter('Mapping must be provided as a tuple')
+    elif len(value) == 0:
+        return {}
+
+    mapping = {}
+    for v in value:
+        key, val = v.split(separator, 1)
+        mapping[key] = val
+
+    return mapping
+
+
 def name_from_uri(url):
     if "://" in url:
         try:
@@ -246,6 +262,10 @@ def check_ext_schema_for_cli(value, allow_none = False):
     return map_
 
 
+def is_schema_empty(schema):
+    return len(schema.get("properties", {})) == 0 and len(schema.get("required", [])) == 0
+
+
 def merge_schemas(*schemas):
     """Merge multiple schemas into one"""
     result = {
@@ -260,6 +280,24 @@ def merge_schemas(*schemas):
     return result
 
 
+def pick_schemas(schema, property_names, rename = {}):
+    """Pick and rename schemas for specific properties"""
+    result = {
+        "required": [],
+        "properties": {}
+    }
+    required = schema.get("required", [])
+    properties = schema.get("properties", {})
+    for prop in property_names:
+        prop2 = rename[prop] if prop in rename else prop
+        if prop in required:
+            result["required"].append(prop2)
+        if prop in properties:
+            result["properties"][prop2] = properties[prop]
+
+    return result
+
+
 def migrate_schema(schema):
     """Migrate schema to a new version"""
     return schema.copy()
diff --git a/fiboa_cli/validate.py b/fiboa_cli/validate.py
index 14c1bb42..08c0ac31 100644
--- a/fiboa_cli/validate.py
+++ b/fiboa_cli/validate.py
@@ -207,6 +207,10 @@ def validate_parquet(file, config):
     for ext in extensions.values():
         schemas = merge_schemas(schemas, ext)
 
+    # Add custom schemas
+    custom_schemas = collection.get("fiboa_custom_schemas", {})
+    schemas = merge_schemas(schemas, custom_schemas)
+
     # Check that all required fields are present
     for key in schemas.get("required", []):
         if key not in parquet_schema.names:
diff --git a/tests/test_improve.py b/tests/test_improve.py
new file mode 100644
index 00000000..b5e0634d
--- /dev/null
+++ b/tests/test_improve.py
@@ -0,0 +1,16 @@
+from fiboa_cli import improve, validate
+from click.testing import CliRunner
+from glob import glob
+
+
+def test_improve(tmp_file):
+    # Improve a single file from the test data
+    files = glob("tests/data-files/merge/at.parquet")
+    runner = CliRunner()
+    args = files + ['-o', tmp_file.name, '-sz', '-g', '-e']
+    result = runner.invoke(improve, args)
+    assert result.exit_code == 0, result.output
+
+    # Improved parquet file should be valid
+    result = runner.invoke(validate, [tmp_file.name, '--data'])
+    assert result.exit_code == 0, result.output
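
> **Editor's note:** What the `--rename-column`/`-r` callback actually receives: click collects the repeated option into a tuple of `old=new` strings, which `parse_map` (in `util.py` above) turns into a dict:

```python
# Splitting happens at the first separator only, so values may contain "=".
from fiboa_cli.util import parse_map

parse_map(("old=new", "note=a=b"))
# -> {"old": "new", "note": "a=b"}
```
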
diff --git a/tests/test_merge.py b/tests/test_merge.py
index 085d5e8b..0f3c75d5 100644
--- a/tests/test_merge.py
+++ b/tests/test_merge.py
@@ -1,5 +1,3 @@
-from geopandas._compat import import_optional_dependency
-
 from fiboa_cli import merge, validate
 from click.testing import CliRunner
 from glob import glob
@@ -12,7 +10,7 @@
 """
 
-def test_converter(tmp_file):
+def test_merge(tmp_file):
     # merge files in directory
     files = glob("tests/data-files/merge/*.parquet")
     runner = CliRunner()
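
> **Editor's note:** The `fiboa_custom_schemas` object written by `create_parquet` travels with the rest of the collection in the Parquet schema metadata. A sketch of reading it back with plain pyarrow — the storage format (JSON under the `b"fiboa"` key) is an assumption inferred from `parse_metadata(schema, b"fiboa")` above, not something this patch states explicitly:

```python
# Inspect the embedded collection metadata of a written file; assumes the
# collection is stored as JSON under the b"fiboa" schema metadata key.
import json
import pyarrow.parquet as pq

schema = pq.read_schema("file2.parquet")  # placeholder path
collection = json.loads(schema.metadata[b"fiboa"])
print(collection.get("fiboa_custom_schemas", {}))
```
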