diff --git a/docs/user_guide.md b/docs/user_guide.md index a2ffd9c8..a39d6db3 100644 --- a/docs/user_guide.md +++ b/docs/user_guide.md @@ -625,9 +625,9 @@ hedpy schema add-ids /path/to/hed-schemas score 2.2.0 1. Validate schema thoroughly before adding IDs 2. Convert to all formats and verify equivalence 3. Add HED IDs only once - they should remain stable -4. Generate ontology after IDs are added -5. Verify that the created ontology is valid using [Protégé](https://protege.stanford.edu/) -6. Commit changes to version control before moving to stable release +4. Commit changes to version control before moving to stable release + +**Note:** Ontology generation (OMN/OWL format) has been moved to the separate [hed-ontology](https://github.com/hed-standard/hed-ontology) repository. ______________________________________________________________________ diff --git a/hed/schema/schema_attribute_validator_hed_id.py b/hed/schema/schema_attribute_validator_hed_id.py index 599f5364..fb1c99e3 100644 --- a/hed/schema/schema_attribute_validator_hed_id.py +++ b/hed/schema/schema_attribute_validator_hed_id.py @@ -1,4 +1,4 @@ -from hed.schema.schema_io.ontology_util import get_library_data +from hed.schema.hed_cache import get_library_data from hed.schema.schema_io.df_util import remove_prefix from semantic_version import Version from hed.schema.hed_schema_io import load_schema_version diff --git a/hed/schema/schema_io/hed_id_util.py b/hed/schema/schema_io/hed_id_util.py new file mode 100644 index 00000000..b362926b --- /dev/null +++ b/hed/schema/schema_io/hed_id_util.py @@ -0,0 +1,236 @@ +"""Utility functions for HED ID assignment and validation. + +This module handles HED ID ranges, validation, and assignment for schema elements. +For ontology/OMN conversion functionality, see the hed-ontology repository. +""" + +import pandas as pd + +from hed.schema.schema_io import schema_util +from hed.errors.exceptions import HedFileError +from hed.schema.hed_schema_constants import HedKey +from hed.schema.schema_io.df_util import remove_prefix +from hed.schema.hed_cache import get_library_data +from hed.schema.schema_io import df_constants as constants + +object_type_id_offset = { + constants.OBJECT_KEY: (100, 300), + constants.DATA_KEY: (300, 500), + constants.ANNOTATION_KEY: (500, 700), + constants.ATTRIBUTE_PROPERTY_KEY: (700, 900), + constants.VALUE_CLASS_KEY: (1300, 1400), + constants.UNIT_MODIFIER_KEY: (1400, 1500), + constants.UNIT_CLASS_KEY: (1500, 1600), + constants.UNIT_KEY: (1600, 1700), + constants.TAG_KEY: (2000, -1), # -1 = go to end of range +} + + +def _get_hedid_range(schema_name, df_key): + """Get the set of HedId's for this object type/schema name. + + Parameters: + schema_name(str): The known schema name with an assigned id range. + df_key(str): The dataframe range type we're interested in. a key from constants.DF_SUFFIXES. + + Returns: + set: A set of all id's in the requested range. + """ + if df_key == constants.STRUCT_KEY: + raise NotImplementedError("Cannot assign hed_ids struct section") + + library_data = get_library_data(schema_name) + if not library_data: + return set() + starting_id, ending_id = library_data["id_range"] + + start_object_range, end_object_range = object_type_id_offset[df_key] + if df_key == constants.TAG_KEY: + initial_tag_adj = 1 # We always skip 1 for tags + else: + initial_tag_adj = 0 + final_start = starting_id + start_object_range + initial_tag_adj + final_end = starting_id + end_object_range + if end_object_range == -1: + # Add one since the versions on hed-schemas are set to max_value - 1 + final_end = ending_id + 1 + return set(range(final_start, final_end)) + + +def get_all_ids(df): + """Returns a set of all unique hedIds in the dataframe + + Parameters: + df(pd.DataFrame): The dataframe + + Returns: + Union[Set, None]: None if this has no HED column, otherwise all unique numbers as a set. + """ + if constants.hed_id in df.columns: + modified_df = df[constants.hed_id].apply(lambda x: remove_prefix(x, "HED_")) + modified_df = pd.to_numeric(modified_df, errors="coerce").dropna().astype(int) + return set(modified_df.unique()) + return None + + +def update_dataframes_from_schema(dataframes, schema, schema_name="", assign_missing_ids=False): + """Write out schema as a dataframe, then merge in extra columns from dataframes. + + Parameters: + dataframes(dict): A full set of schema spreadsheet formatted dataframes + schema(HedSchema): The schema to write into the dataframes: + schema_name(str): The name to use to find the schema id range. + assign_missing_ids(bool): If True, replacing any blank(new) HedIds with valid ones + + Returns: + dict[str:pd.DataFrames]: The updated dataframes. These dataframes can potentially have extra columns. + + """ + hedid_errors = [] + if not schema_name: + schema_name = schema.library + # 1. Verify existing HED ids don't conflict between schema/dataframes + for df_key, df in dataframes.items(): + if df_key in constants.DF_SUFFIXES: + continue + section_key = constants.section_mapping_hed_id.get(df_key) + if not section_key: + continue + section = schema[section_key] + + unused_tag_ids = _get_hedid_range(schema_name, df_key) + hedid_errors += _verify_hedid_matches(section, df, unused_tag_ids) + + if hedid_errors: + raise HedFileError( + hedid_errors[0]["code"], + f"{len(hedid_errors)} issues found with hedId mismatches. See the .issues " + f"parameter on this exception for more details.", + schema.name, + issues=hedid_errors, + ) + + # 2. Get the new schema as DFs + from hed.schema.schema_io.schema2df import Schema2DF # Late import as this is recursive + + output_dfs = Schema2DF().process_schema(schema, save_merged=False) + + if assign_missing_ids: + # 3: Add any HED ID's as needed to these generated dfs + for df_key, df in output_dfs.items(): + if df_key == constants.STRUCT_KEY or df_key in constants.DF_EXTRAS: + continue + unused_tag_ids = _get_hedid_range(schema_name, df_key) + + # If no errors, assign new HED ID's + assign_hed_ids_section(df, unused_tag_ids) + + # 4: Merge the dataframes + for df_key in output_dfs.keys(): + if df_key in constants.DF_EXTRAS: + continue + out_df = output_dfs[df_key] + df = dataframes[df_key] + merge_dfs(out_df, df) + + return output_dfs + + +def _verify_hedid_matches(section, df, unused_tag_ids): + """Verify ID's in both have the same label, and verify all entries in the dataframe are already in the schema + + Parameters: + section(HedSchemaSection): The loaded schema section to compare ID's with + df(pd.DataFrame): The loaded spreadsheet dataframe to compare with + unused_tag_ids(set): The valid range of IDs for this df. + + Returns: + list[str]: A list of errors found matching IDs. + """ + hedid_errors = [] + for row_number, row in df.iterrows(): + if not any(row): + continue + label = row[constants.name] + if label.endswith("-#"): + label = label.replace("-#", "/#") + df_id = row[constants.hed_id] + entry = section.get(label) + if not entry: + # Neither side has a hedID, so nothing to do. + if not df_id: + continue + hedid_errors += schema_util.format_error( + row_number, row, f"'{label}' does not exist in schema file only the spreadsheet." + ) + continue + entry_id = entry.attributes.get(HedKey.HedID) + if df_id: + if not (df_id.startswith("HED_") and len(df_id) == len("HED_0000000")): + hedid_errors += schema_util.format_error( + row_number, row, f"'{label}' has an improperly formatted hedID in dataframe." + ) + continue + id_value = remove_prefix(df_id, "HED_") + try: + id_int = int(id_value) + if id_int not in unused_tag_ids: + hedid_errors += schema_util.format_error( + row_number, + row, + f"'{label}' has id {id_int} which is outside " + + "of the valid range for this type. Valid range is: " + + f"{min(unused_tag_ids)} to {max(unused_tag_ids)}", + ) + continue + except ValueError: + hedid_errors += schema_util.format_error( + row_number, row, f"'{label}' has a non-numeric hedID in the dataframe." + ) + continue + + if entry_id and entry_id != df_id: + hedid_errors += schema_util.format_error( + row_number, row, f"'{label}' has hedID '{df_id}' in dataframe, but '{entry_id}' in schema." + ) + continue + + return hedid_errors + + +def assign_hed_ids_section(df, unused_tag_ids): + """Adds missing HedIds to dataframe. + + Parameters: + df(pd.DataFrame): The dataframe to add id's to. + unused_tag_ids(set of int): The possible HED id's to assign from + """ + # Remove already used ids + unused_tag_ids -= get_all_ids(df) + sorted_unused_ids = sorted(unused_tag_ids, reverse=True) + + for _row_number, row in df.iterrows(): + hed_id = row[constants.hed_id] + # we already verified existing ones + if hed_id: + continue + hed_id = f"HED_{sorted_unused_ids.pop():07d}" + row[constants.hed_id] = hed_id + + +def merge_dfs(dest_df, source_df): + """Merges extra columns from source_df into dest_df, adding the extra columns from the ontology to the schema df. + + Parameters: + dest_df (DataFrame): The dataframe to add extra columns to + source_df (DataFrame): The dataframe to get extra columns from + """ + # todo: vectorize this at some point + save_df1_columns = dest_df.columns.copy() + for _index, row in source_df.iterrows(): + # Find matching index in df1 based on 'rdfs:label' + match_index = dest_df[dest_df["rdfs:label"] == row["rdfs:label"]].index + if not match_index.empty: + for col in source_df.columns: + if col not in save_df1_columns: + dest_df.at[match_index[0], col] = row[col] diff --git a/hed/schema/schema_io/ontology_util.py b/hed/schema/schema_io/ontology_util.py deleted file mode 100644 index cb85111f..00000000 --- a/hed/schema/schema_io/ontology_util.py +++ /dev/null @@ -1,480 +0,0 @@ -"""Utility functions for saving as an ontology or dataframe.""" - -import pandas as pd - -from hed.schema.schema_io import schema_util, df_constants as constants -from hed.errors.exceptions import HedFileError -from hed.schema.hed_schema_constants import HedKey -from hed.schema.schema_io.df_util import remove_prefix, calculate_attribute_type, get_attributes_from_row -from hed.schema.hed_cache import get_library_data - -object_type_id_offset = { - constants.OBJECT_KEY: (100, 300), - constants.DATA_KEY: (300, 500), - constants.ANNOTATION_KEY: (500, 700), - constants.ATTRIBUTE_PROPERTY_KEY: (700, 900), - constants.VALUE_CLASS_KEY: (1300, 1400), - constants.UNIT_MODIFIER_KEY: (1400, 1500), - constants.UNIT_CLASS_KEY: (1500, 1600), - constants.UNIT_KEY: (1600, 1700), - constants.TAG_KEY: (2000, -1), # -1 = go to end of range -} - - -def _get_hedid_range(schema_name, df_key): - """Get the set of HedId's for this object type/schema name. - - Parameters: - schema_name(str): The known schema name with an assigned id range. - df_key(str): The dataframe range type we're interested in. a key from constants.DF_SUFFIXES. - - Returns: - set: A set of all id's in the requested range. - """ - if df_key == constants.STRUCT_KEY: - raise NotImplementedError("Cannot assign hed_ids struct section") - - library_data = get_library_data(schema_name) - if not library_data: - return set() - starting_id, ending_id = library_data["id_range"] - - start_object_range, end_object_range = object_type_id_offset[df_key] - if df_key == constants.TAG_KEY: - initial_tag_adj = 1 # We always skip 1 for tags - else: - initial_tag_adj = 0 - final_start = starting_id + start_object_range + initial_tag_adj - final_end = starting_id + end_object_range - if end_object_range == -1: - # Add one since the versions on hed-schemas are set to max_value - 1 - final_end = ending_id + 1 - return set(range(final_start, final_end)) - - -def get_all_ids(df): - """Returns a set of all unique hedIds in the dataframe - - Parameters: - df(pd.DataFrame): The dataframe - - Returns: - Union[Set, None]: None if this has no HED column, otherwise all unique numbers as a set. - """ - if constants.hed_id in df.columns: - modified_df = df[constants.hed_id].apply(lambda x: remove_prefix(x, "HED_")) - modified_df = pd.to_numeric(modified_df, errors="coerce").dropna().astype(int) - return set(modified_df.unique()) - return None - - -def update_dataframes_from_schema(dataframes, schema, schema_name="", get_as_ids=False, assign_missing_ids=False): - """Write out schema as a dataframe, then merge in extra columns from dataframes. - - Parameters: - dataframes(dict): A full set of schema spreadsheet formatted dataframes - schema(HedSchema): The schema to write into the dataframes: - schema_name(str): The name to use to find the schema id range. - get_as_ids(bool): If True, replace all known references with HedIds - assign_missing_ids(bool): If True, replacing any blank(new) HedIds with valid ones - - Returns: - dict[str:pd.DataFrames]: The updated dataframes. These dataframes can potentially have extra columns. - - """ - hedid_errors = [] - if not schema_name: - schema_name = schema.library - # 1. Verify existing HED ids don't conflict between schema/dataframes - for df_key, df in dataframes.items(): - if df_key in constants.DF_SUFFIXES: - continue - section_key = constants.section_mapping_hed_id.get(df_key) - if not section_key: - continue - section = schema[section_key] - - unused_tag_ids = _get_hedid_range(schema_name, df_key) - hedid_errors += _verify_hedid_matches(section, df, unused_tag_ids) - - if hedid_errors: - raise HedFileError( - hedid_errors[0]["code"], - f"{len(hedid_errors)} issues found with hedId mismatches. See the .issues " - f"parameter on this exception for more details.", - schema.name, - issues=hedid_errors, - ) - - # 2. Get the new schema as DFs - from hed.schema.schema_io.schema2df import Schema2DF # Late import as this is recursive - - output_dfs = Schema2DF(get_as_ids=get_as_ids).process_schema(schema, save_merged=False) - - if assign_missing_ids: - # 3: Add any HED ID's as needed to these generated dfs - for df_key, df in output_dfs.items(): - if df_key == constants.STRUCT_KEY or df_key in constants.DF_EXTRAS: - continue - unused_tag_ids = _get_hedid_range(schema_name, df_key) - - # If no errors, assign new HED ID's - assign_hed_ids_section(df, unused_tag_ids) - - # 4: Merge the dataframes - for df_key in output_dfs.keys(): - if df_key in constants.DF_EXTRAS: - continue - out_df = output_dfs[df_key] - df = dataframes[df_key] - merge_dfs(out_df, df) - - return output_dfs - - -def _verify_hedid_matches(section, df, unused_tag_ids): - """Verify ID's in both have the same label, and verify all entries in the dataframe are already in the schema - - Parameters: - section(HedSchemaSection): The loaded schema section to compare ID's with - df(pd.DataFrame): The loaded spreadsheet dataframe to compare with - unused_tag_ids(set): The valid range of IDs for this df. - - Returns: - list[str]: A list of errors found matching IDs. - """ - hedid_errors = [] - for row_number, row in df.iterrows(): - if not any(row): - continue - label = row[constants.name] - if label.endswith("-#"): - label = label.replace("-#", "/#") - df_id = row[constants.hed_id] - entry = section.get(label) - if not entry: - # Neither side has a hedID, so nothing to do. - if not df_id: - continue - hedid_errors += schema_util.format_error( - row_number, row, f"'{label}' does not exist in schema file only the spreadsheet." - ) - continue - entry_id = entry.attributes.get(HedKey.HedID) - if df_id: - if not (df_id.startswith("HED_") and len(df_id) == len("HED_0000000")): - hedid_errors += schema_util.format_error( - row_number, row, f"'{label}' has an improperly formatted hedID in dataframe." - ) - continue - id_value = remove_prefix(df_id, "HED_") - try: - id_int = int(id_value) - if id_int not in unused_tag_ids: - hedid_errors += schema_util.format_error( - row_number, - row, - f"'{label}' has id {id_int} which is outside " - + "of the valid range for this type. Valid range is: " - + f"{min(unused_tag_ids)} to {max(unused_tag_ids)}", - ) - continue - except ValueError: - hedid_errors += schema_util.format_error( - row_number, row, f"'{label}' has a non-numeric hedID in the dataframe." - ) - continue - - if entry_id and entry_id != df_id: - hedid_errors += schema_util.format_error( - row_number, row, f"'{label}' has hedID '{df_id}' in dataframe, but '{entry_id}' in schema." - ) - continue - - return hedid_errors - - -def assign_hed_ids_section(df, unused_tag_ids): - """Adds missing HedIds to dataframe. - - Parameters: - df(pd.DataFrame): The dataframe to add id's to. - unused_tag_ids(set of int): The possible HED id's to assign from - """ - # Remove already used ids - unused_tag_ids -= get_all_ids(df) - sorted_unused_ids = sorted(unused_tag_ids, reverse=True) - - for _row_number, row in df.iterrows(): - hed_id = row[constants.hed_id] - # we already verified existing ones - if hed_id: - continue - hed_id = f"HED_{sorted_unused_ids.pop():07d}" - row[constants.hed_id] = hed_id - - -def merge_dfs(dest_df, source_df): - """Merges extra columns from source_df into dest_df, adding the extra columns from the ontology to the schema df. - - Parameters: - dest_df (DataFrame): The dataframe to add extra columns to - source_df (DataFrame): The dataframe to get extra columns from - """ - # todo: vectorize this at some point - save_df1_columns = dest_df.columns.copy() - for _index, row in source_df.iterrows(): - # Find matching index in df1 based on 'rdfs:label' - match_index = dest_df[dest_df["rdfs:label"] == row["rdfs:label"]].index - if not match_index.empty: - for col in source_df.columns: - if col not in save_df1_columns: - dest_df.at[match_index[0], col] = row[col] - - -def _get_annotation_prop_ids(schema): - annotation_props = {} - for entry in schema.attributes.values(): - attribute_type = calculate_attribute_type(entry) - - if attribute_type == "annotation": - annotation_props[entry.name] = entry.attributes[HedKey.HedID] - - for entry in schema.properties.values(): - annotation_props[entry.name] = entry.attributes[HedKey.HedID] - - return annotation_props - - -def get_prefixes(dataframes): - """Get the prefixes and external annotation terms from the dataframes for ontology conversion.""" - prefixes = dataframes.get(constants.PREFIXES_KEY) - extensions = dataframes.get(constants.EXTERNAL_ANNOTATION_KEY) - sources = dataframes.get(constants.SOURCES_KEY) - if prefixes is None or extensions is None: - return {} - prefixes.columns = prefixes.columns.str.lower() - all_prefixes = {prefix.prefix: prefix[2] for prefix in prefixes.itertuples()} - extensions.columns = extensions.columns.str.lower() - sources.columns = sources.columns.str.lower() - annotation_terms = {} - for row in extensions.itertuples(): - annotation_terms[row.prefix + row.id] = all_prefixes[row.prefix] - source_dict = {} - for row in sources.itertuples(): - source_dict[row.source] = row.link - return annotation_terms, source_dict - - -def convert_df_to_omn(dataframes): - """Convert the dataframe format schema to omn format. - - Parameters: - dataframes(dict): A set of dataframes representing a schema, potentially including extra columns - - Returns: - tuple[str, dict]: - - A combined string representing (most of) a schema omn file. - - A of DF_SUFFIXES:str, representing each .tsv file in omn format. - """ - from hed.schema.hed_schema_io import from_dataframes - from hed.schema.schema_io.schema2df import Schema2DF # Late import as this is recursive - - annotation_terms, source_dict = get_prefixes(dataframes) - - # Load the schema, so we can save it out with ID's - schema = from_dataframes(dataframes) - schema2df = Schema2DF(get_as_ids=True) - output1 = schema2df.process_schema(schema, save_merged=False) - if hasattr(schema, "extras") and schema.extras: - output1.update(schema.extras) - # Convert dataframes to hedId format, and add any missing hedId's(generally, they should be replaced before here) - dataframes_u = update_dataframes_from_schema(dataframes, schema, get_as_ids=True) - - # Copy over remaining non schema dataframes. - for suffix in constants.DF_EXTRAS: - if suffix in dataframes: - dataframes_u[suffix] = dataframes[suffix] - - # Write out the new dataframes in omn format - annotation_props = _get_annotation_prop_ids(schema) - full_text = "" - omn_data = {} - for suffix, _dataframe in dataframes_u.items(): - if suffix in constants.DF_EXTRAS: - output_text = _convert_extra_df_to_omn(dataframes_u[suffix], suffix) - else: - output_text = _convert_df_to_omn( - dataframes_u[suffix], annotation_properties=annotation_props, annotation_terms=annotation_terms - ) - omn_data[suffix] = output_text - full_text += output_text + "\n" - - return full_text, omn_data - - -def _convert_df_to_omn(df, annotation_properties=("",), annotation_terms=None): - """Takes a single df format schema and converts it to omn. - - This is one section, e.g. tags, units, etc. - - Note: This mostly assumes a fully valid df. A df missing a required column will raise an error. - - Parameters: - df(pd.DataFrame): the dataframe to turn into omn - annotation_properties(dict): Known annotation properties, with the values being their hedId. - annotation_terms(dict): The list of valid external omn tags, such as "dc:source". - - Returns: - str: The omn formatted text for this section. - - """ - output_text = "" - for _index, row in df.iterrows(): - prop_type = _get_property_type(row) - hed_id = row[constants.hed_id] - output_text += f"{prop_type}: hed:{hed_id}\n" - output_text += _add_annotation_lines(row, annotation_properties, annotation_terms) - - if prop_type != "AnnotationProperty": - if constants.property_domain in row.index: - prop_domain = row[constants.property_domain] - output_text += "\tDomain:\n" - output_text += f"\t\t{prop_domain}\n" - if constants.property_range in row.index: - prop_range = row[constants.property_range] - output_text += "\tRange:\n" - output_text += f"\t\t{prop_range}\n" - output_text += "\n" - - if constants.equivalent_to in row.index: - equivalent_to = row[constants.equivalent_to] - equivalent_to = equivalent_to.replace(" and ", "\n\t\tand ") - subclass_of = row[constants.subclass_of] - if equivalent_to: - output_text += "\tEquivalentTo:\n" - output_text += f"\t\t{equivalent_to}" - else: - output_text += "\tSubClassOf:\n" - output_text += f"\t\t{subclass_of}" - output_text += "\n" - - output_text += "\n" - return output_text - - -def _convert_extra_df_to_omn(df, suffix): - """Takes a single df format schema and converts it to omn. - - This is one section, e.g. tags, units, etc. - - Note: This mostly assumes a fully valid df. A df missing a required column will raise an error. - - Parameters: - df(pd.DataFrame): the dataframe to turn into omn - suffix(dict): Known annotation properties, with the values being their hedId. - - Returns: - str: the omn formatted text for this section. - - """ - output_text = "" - for _index, row in df.iterrows(): - renamed_row = row.rename(index=constants.EXTRAS_CONVERSIONS) - if suffix == constants.PREFIXES_KEY: - output_text += f"Prefix: {renamed_row[constants.Prefix]} <{renamed_row[constants.namespace]}>" - elif suffix == constants.EXTERNAL_ANNOTATION_KEY: - output_text += f"AnnotationProperty: {renamed_row[constants.Prefix]}{renamed_row[constants.ID]}" - elif suffix == constants.SOURCES_KEY: - output_text += f"Source: {renamed_row[constants.source]}" - if renamed_row[constants.link]: - output_text += f" <{renamed_row[constants.link]}>" - if renamed_row[constants.description]: - output_text += f' "{renamed_row[constants.description]}"' - else: - raise ValueError(f"Unknown tsv suffix attempting to be converted {suffix}") - - output_text += "\n" - return output_text - - -def _split_on_unquoted_commas(input_string): - """Splits the given string into comma separated portions, ignoring commas inside double quotes. - - Parameters: - input_string: The string to split - - Returns: - list: The split apart string. - """ - # Note: does not handle escaped double quotes. - parts = [] - current = [] - in_quotes = False - - for char in input_string: - if char == '"': - in_quotes = not in_quotes - if char == "," and not in_quotes: - parts.append("".join(current).strip()) - current = [] - else: - current.append(char) - - if current: # Add the last part if there is any. - parts.append("".join(current).strip()) - - return parts - - -def _split_annotation_values(parts): - annotations = {} - for part in parts: - key, value = part.split(" ", 1) - annotations[key] = value - - return annotations - - -def _add_annotation_lines(row, annotation_properties, annotation_terms): - annotation_lines = [] - description = row[constants.dcdescription] - if description: - annotation_lines.append(f'\t\t{constants.dcdescription} "{description}"') - name = row[constants.name] - if name: - annotation_lines.append(f'\t\t{constants.name} "{name}"') - - # Add annotation properties(other than HedId) - attributes = get_attributes_from_row(row) - for attribute in attributes: - if attribute in annotation_properties and attribute != HedKey.HedID: - annotation_id = f"hed:{annotation_properties[attribute]}" - value = attributes[attribute] - if value is True: - value = "true" - else: - value = f'"{value}"' - annotation_lines.append(f"\t\t{annotation_id} {value}") - - # if constants.annotations in row.index: - # portions = _split_on_unquoted_commas(row[constants.annotations]) - # annotations = _split_annotation_values(portions) - # - # for key, value in annotations.items(): - # if key not in annotation_terms: - # raise ValueError(f"Problem. Found {key} which is not in the prefix/annotation list.") - # annotation_lines.append(f"\t\t{key} {value}") - - output_text = "" - if annotation_lines: - output_text += "\tAnnotations:\n" - output_text += ",\n".join(annotation_lines) - output_text += "\n" - - return output_text - - -def _get_property_type(row): - """Gets the property type from the row.""" - return row[constants.property_type] if constants.property_type in row.index else "Class" diff --git a/hed/scripts/create_ontology.py b/hed/scripts/create_ontology.py deleted file mode 100644 index 1d194272..00000000 --- a/hed/scripts/create_ontology.py +++ /dev/null @@ -1,64 +0,0 @@ -from hed.errors import HedFileError, get_printable_issue_string -from hed.schema.schema_io import load_dataframes -from hed.schema.schema_io.ontology_util import convert_df_to_omn -from hed.scripts.hed_script_util import get_prerelease_path, get_schema_filename -import argparse -import os - - -def create_ontology(repo_path, schema_name, schema_version, dest): - """Creates an ontology out of the given schema - - Parameters: - repo_path(str): the location of the hed-schemas folder relative to this one. Should point into the folder. - schema_name(str): The name of the schema we're interested in. "standard" for the standard schema - schema_version(str): The semantic version number - dest(str): Location for output - - Returns: - int: 0 on success. - - Raises: - HedFileError: An exception otherwise. - """ - final_source = get_prerelease_path(repo_path, schema_name, schema_version) - # print(f"Creating ontology from {final_source}") - - dataframes = load_dataframes(final_source) - try: - _, omn_dict = convert_df_to_omn(dataframes) - except HedFileError as e: - if e.issues: - print(get_printable_issue_string(e.issues, title="Issues converting schema:")) - raise e - - base = get_schema_filename(schema_name, schema_version) - output_dest = os.path.join(dest, base, "generated_omn") - os.makedirs(output_dest, exist_ok=True) - for suffix, omn_text in omn_dict.items(): - filename = os.path.join(output_dest, f"{base}_{suffix}.omn") - with open(filename, mode="w", encoding="utf-8") as opened_file: - opened_file.writelines(omn_text) - - return 0 - - -def main(): - parser = argparse.ArgumentParser(description="Convert a specified schema in the prerelease folder to an ontology.") - parser.add_argument("repo_path", help="The location of the hed-schemas directory") - parser.add_argument("schema_name", help='The name of the schema to convert("standard" for standard schema)') - parser.add_argument("schema_version", help="The schema version to modify") - parser.add_argument("--dest", default=os.path.join("src", "ontology"), help="The base location to save to") - - args = parser.parse_args() - - repo_path = args.repo_path - schema_name = args.schema_name - schema_version = args.schema_version - dest = args.dest - - return create_ontology(repo_path, schema_name, schema_version, dest) - - -if __name__ == "__main__": - exit(main()) diff --git a/hed/scripts/hed_convert_schema.py b/hed/scripts/hed_convert_schema.py index 720b2693..3e0a1b31 100644 --- a/hed/scripts/hed_convert_schema.py +++ b/hed/scripts/hed_convert_schema.py @@ -1,6 +1,6 @@ from hed.scripts.hed_script_util import sort_base_schemas, validate_all_schemas, add_extension from hed.schema.schema_io import load_dataframes, save_dataframes -from hed.schema.schema_io.ontology_util import update_dataframes_from_schema +from hed.schema.schema_io.hed_id_util import update_dataframes_from_schema from hed.schema.hed_schema_io import load_schema, from_dataframes from hed.errors import get_printable_issue_string, HedFileError import argparse diff --git a/pyproject.toml b/pyproject.toml index ff9aea88..9056a33c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -104,7 +104,6 @@ hed_extract_bids_sidecar = "hed.scripts.hed_extract_bids_sidecar:main" hed_validate_schemas = "hed.scripts.validate_schemas:main" hed_update_schemas = "hed.scripts.hed_convert_schema:main" hed_add_ids = "hed.scripts.add_hed_ids:main" -hed_create_ontology = "hed.scripts.create_ontology:main" [tool.setuptools_scm] write_to = "hed/_version.py" diff --git a/tests/schema/test_ontology_util.py b/tests/schema/test_hed_id_util.py similarity index 75% rename from tests/schema/test_ontology_util.py rename to tests/schema/test_hed_id_util.py index af799302..5b9f5236 100644 --- a/tests/schema/test_ontology_util.py +++ b/tests/schema/test_hed_id_util.py @@ -2,12 +2,11 @@ import pandas as pd from hed import HedFileError -from hed.schema.schema_io import ontology_util, df_util, df_constants as constants -from hed.schema.schema_io.ontology_util import ( +from hed.schema.schema_io import hed_id_util, df_util, df_constants as constants +from hed.schema.schema_io.hed_id_util import ( _verify_hedid_matches, assign_hed_ids_section, get_all_ids, - convert_df_to_omn, update_dataframes_from_schema, ) from hed.schema.schema_io.df_util import get_library_name_and_id @@ -41,19 +40,19 @@ def test_get_library_name_and_id_unknown(self): self.assertEqual(first_id, df_util.UNKNOWN_LIBRARY_VALUE) def test_get_hedid_range_normal_case(self): - id_set = ontology_util._get_hedid_range("score", constants.DATA_KEY) + id_set = hed_id_util._get_hedid_range("score", constants.DATA_KEY) self.assertTrue(40401 in id_set) self.assertEqual(len(id_set), 200) # Check the range size def test_get_hedid_range_boundary(self): # Test boundary condition where end range is -1 - id_set = ontology_util._get_hedid_range("score", constants.TAG_KEY) + id_set = hed_id_util._get_hedid_range("score", constants.TAG_KEY) self.assertTrue(42001 in id_set) self.assertEqual(len(id_set), 18000 - 1) # From 42001 to 60000 def test_get_hedid_range_error(self): with self.assertRaises(NotImplementedError): - ontology_util._get_hedid_range("lang", constants.STRUCT_KEY) + hed_id_util._get_hedid_range("lang", constants.STRUCT_KEY) class TestVerifyHedIdMatches(unittest.TestCase): @@ -62,36 +61,36 @@ def setUp(self): def test_no_hedid(self): df = pd.DataFrame([{"rdfs:label": "Event", "hedId": ""}, {"rdfs:label": "Age-#", "hedId": ""}]) - errors = _verify_hedid_matches(self.schema_82.tags, df, ontology_util._get_hedid_range("", constants.TAG_KEY)) + errors = _verify_hedid_matches(self.schema_82.tags, df, hed_id_util._get_hedid_range("", constants.TAG_KEY)) self.assertEqual(len(errors), 0) def test_id_matches(self): df = pd.DataFrame([{"rdfs:label": "Event", "hedId": "HED_0012001"}, {"rdfs:label": "Age-#", "hedId": "HED_0012475"}]) - errors = _verify_hedid_matches(hed_schema_global.tags, df, ontology_util._get_hedid_range("", constants.TAG_KEY)) + errors = _verify_hedid_matches(hed_schema_global.tags, df, hed_id_util._get_hedid_range("", constants.TAG_KEY)) self.assertEqual(len(errors), 0) def test_label_mismatch_id(self): df = pd.DataFrame([{"rdfs:label": "Event", "hedId": "HED_0012005"}, {"rdfs:label": "Age-#", "hedId": "HED_0012007"}]) - errors = _verify_hedid_matches(hed_schema_global.tags, df, ontology_util._get_hedid_range("", constants.TAG_KEY)) + errors = _verify_hedid_matches(hed_schema_global.tags, df, hed_id_util._get_hedid_range("", constants.TAG_KEY)) self.assertEqual(len(errors), 2) def test_label_no_entry(self): df = pd.DataFrame([{"rdfs:label": "NotARealEvent", "hedId": "does_not_matter"}]) - errors = _verify_hedid_matches(hed_schema_global.tags, df, ontology_util._get_hedid_range("", constants.TAG_KEY)) + errors = _verify_hedid_matches(hed_schema_global.tags, df, hed_id_util._get_hedid_range("", constants.TAG_KEY)) self.assertEqual(len(errors), 1) def test_out_of_range(self): df = pd.DataFrame([{"rdfs:label": "Event", "hedId": "HED_0000000"}]) - errors = _verify_hedid_matches(self.schema_82.tags, df, ontology_util._get_hedid_range("", constants.TAG_KEY)) + errors = _verify_hedid_matches(self.schema_82.tags, df, hed_id_util._get_hedid_range("", constants.TAG_KEY)) self.assertEqual(len(errors), 1) def test_not_int(self): df = pd.DataFrame([{"rdfs:label": "Event", "hedId": "HED_AAAAAAA"}]) - errors = _verify_hedid_matches(self.schema_82.tags, df, ontology_util._get_hedid_range("", constants.TAG_KEY)) + errors = _verify_hedid_matches(self.schema_82.tags, df, hed_id_util._get_hedid_range("", constants.TAG_KEY)) self.assertEqual(len(errors), 1) def test_get_all_ids_exists(self): @@ -152,24 +151,5 @@ def test_update_dataframes_from_schema(self): self.assertEqual(len(e.issues), 115) -class TestConvertOmn(unittest.TestCase): - def test_convert_df_to_omn(self): - dataframes = hed_schema_global.get_as_dataframes() - omn_version, _ = convert_df_to_omn(dataframes) - - # make these more robust, for now just verify it's somewhere in the result - for df_name, df in dataframes.items(): - if df_name == constants.STRUCT_KEY or "rdfs:label" not in df.columns: - continue # Not implemented yet - for label in df["rdfs:label"]: - # Verify that the label is somewhere in the OMN text - error = f"Label '{label}' from dataframe '{df_name}' was not found in the OMN output." - label_key = f'rdfs:label "{label}"' - self.assertIn(label_key, omn_version, error) - - for hed_id in df[constants.hed_id]: - if df_name == constants.STRUCT_KEY: - continue # Not implemented yet - base_id = f": hed:{hed_id}" - error = f"HedId '{base_id}' from dataframe '{df_name}' was not found in the OMN output." - self.assertIn(base_id, omn_version, error) +if __name__ == "__main__": + unittest.main()