-
Notifications
You must be signed in to change notification settings - Fork 27
Bug: parsing annotations that failed to download crashes #655
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -30,15 +30,21 @@ | |||||
| from dagshub.common.rich_util import get_rich_progress | ||||||
| from dagshub.common.util import lazy_load, multi_urljoin | ||||||
| from dagshub.data_engine.annotation import MetadataAnnotations | ||||||
| from dagshub.data_engine.annotation.metadata import UnsupportedMetadataAnnotations | ||||||
| from dagshub.data_engine.annotation.metadata import ErrorMetadataAnnotations, UnsupportedMetadataAnnotations | ||||||
| from dagshub.data_engine.annotation.voxel_conversion import ( | ||||||
| add_ls_annotations, | ||||||
| add_voxel_annotations, | ||||||
| ) | ||||||
| from dagshub.data_engine.client.loaders.base import DagsHubDataset | ||||||
| from dagshub.data_engine.client.models import DatasourceType, MetadataSelectFieldSchema | ||||||
| from dagshub.data_engine.dtypes import MetadataFieldType | ||||||
| from dagshub.data_engine.model.datapoint import Datapoint, _generated_fields, _get_blob | ||||||
| from dagshub.data_engine.model.datapoint import ( | ||||||
| BlobDownloadError, | ||||||
| BlobHashMetadata, | ||||||
| Datapoint, | ||||||
| _generated_fields, | ||||||
| _get_blob, | ||||||
| ) | ||||||
| from dagshub.data_engine.model.schema_util import dacite_config | ||||||
| from dagshub.data_engine.voxel_plugin_server.utils import set_voxel_envvars | ||||||
|
|
||||||
|
|
@@ -390,10 +396,9 @@ def get_blob_fields( | |||||
| for dp in self.entries: | ||||||
| for fld in fields: | ||||||
| field_value = dp.metadata.get(fld) | ||||||
| # If field_value is a blob or a path, then ignore, means it's already been downloaded | ||||||
| if not isinstance(field_value, str): | ||||||
| if not isinstance(field_value, BlobHashMetadata): | ||||||
| continue | ||||||
| download_task = (dp, fld, dp.blob_url(field_value), dp.blob_cache_location / field_value) | ||||||
| download_task = (dp, fld, dp.blob_url(field_value.hash), dp.blob_cache_location / field_value.hash) | ||||||
| to_download.append(download_task) | ||||||
|
|
||||||
| progress = get_rich_progress(rich.progress.MofNCompleteColumn()) | ||||||
|
|
@@ -403,8 +408,6 @@ def get_blob_fields( | |||||
|
|
||||||
| def _get_blob_fn(dp: Datapoint, field: str, url: str, blob_path: Path): | ||||||
| blob_or_path = _get_blob(url, blob_path, auth, cache_on_disk, load_into_memory, path_format) | ||||||
| if isinstance(blob_or_path, str) and path_format != "str": | ||||||
| logger.warning(f"Error while downloading blob for field {field} in datapoint {dp.path}:{blob_or_path}") | ||||||
| dp.metadata[field] = blob_or_path | ||||||
|
|
||||||
| with progress: | ||||||
|
|
@@ -416,7 +419,7 @@ def _get_blob_fn(dp: Datapoint, field: str, url: str, blob_path: Path): | |||||
| logger.warning(f"Got exception {type(exc)} while downloading blob: {exc}") | ||||||
| progress.update(task, advance=1) | ||||||
|
|
||||||
| self._convert_annotation_fields(*fields, load_into_memory=load_into_memory) | ||||||
| self._convert_annotation_fields(*fields) | ||||||
|
|
||||||
| # Convert any downloaded document fields | ||||||
| document_fields = [f for f in fields if f in self.document_fields] | ||||||
|
|
@@ -425,18 +428,17 @@ def _get_blob_fn(dp: Datapoint, field: str, url: str, blob_path: Path): | |||||
| if document_fields: | ||||||
| for dp in self: | ||||||
| for fld in document_fields: | ||||||
| if fld in dp.metadata: | ||||||
| # Defensive check to not mangle annotation fields by accident | ||||||
| if isinstance(dp.metadata[fld], MetadataAnnotations): | ||||||
| continue | ||||||
| # Force load the content into memory, even if load_into_memory was set to False | ||||||
| if not load_into_memory or isinstance(dp.metadata[fld], Path): | ||||||
| dp.metadata[fld] = Path(dp.metadata[fld]).read_bytes() | ||||||
| dp.metadata[fld] = dp.metadata[fld].decode("utf-8") | ||||||
| if fld not in dp.metadata: | ||||||
| continue | ||||||
| try: | ||||||
| content = dp.get_blob(fld) | ||||||
| dp.metadata[fld] = content.decode("utf-8") | ||||||
| except BlobDownloadError as e: | ||||||
| logger.warning(f"Failed to download document field '{fld}' for datapoint '{dp.path}': {e}") | ||||||
|
|
||||||
| return self | ||||||
|
|
||||||
| def _convert_annotation_fields(self, *fields, load_into_memory): | ||||||
| def _convert_annotation_fields(self, *fields): | ||||||
| # Convert any downloaded annotation column | ||||||
| annotation_fields = [f for f in fields if f in self.annotation_fields] | ||||||
| if not annotation_fields: | ||||||
|
|
@@ -448,7 +450,7 @@ def _convert_annotation_fields(self, *fields, load_into_memory): | |||||
| for dp in self: | ||||||
| for fld in annotation_fields: | ||||||
| metadata_value = dp.metadata.get(fld) | ||||||
| # No value - create ampty annotation container | ||||||
| # No value - create empty annotation container | ||||||
| if metadata_value is None: | ||||||
| dp.metadata[fld] = MetadataAnnotations(datapoint=dp, field=fld) | ||||||
| continue | ||||||
|
|
@@ -457,16 +459,17 @@ def _convert_annotation_fields(self, *fields, load_into_memory): | |||||
| continue | ||||||
| # Parse annotation from the content of the field | ||||||
| else: | ||||||
| # Force load the content into memory, even if load_into_memory was set to False | ||||||
| if not load_into_memory or isinstance(dp.metadata[fld], Path): | ||||||
| metadata_value = Path(metadata_value).read_bytes() | ||||||
| try: | ||||||
| annotation_content = dp.get_blob(fld) | ||||||
| dp.metadata[fld] = MetadataAnnotations.from_ls_task( | ||||||
| datapoint=dp, field=fld, ls_task=metadata_value | ||||||
| datapoint=dp, field=fld, ls_task=annotation_content | ||||||
|
Comment on lines
+463
to
+465
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Isn't it weird for the dp.get_blob to convert to ls_task (bytes), then reconvert it to MetadataAnnotations here? Or is this for the scenario where dp.get_blob returns bytes directly without going through MetadataAnnotations (loading from disk IIUC)? Feels clumsy, like maybe converting to MetadataAnnotations should happen only here or in dp.get_blob but why in both? In which scenario will
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The intentional data flow here is:
The reason This part will not be hit if you, for example, try to load and convert an annotation field a second time, because at that point, the conversion would have already happened, and then this type guard prevents doing a double conversion: client/dagshub/data_engine/model/query_result.py Lines 458 to 459 in 7c8fa55
|
||||||
| ) | ||||||
| except BlobDownloadError as e: | ||||||
| dp.metadata[fld] = ErrorMetadataAnnotations(datapoint=dp, field=fld, error_message=e.message) | ||||||
| bad_annotations[fld].append(dp.path) | ||||||
| except ValidationError: | ||||||
| dp.metadata[fld] = UnsupportedMetadataAnnotations( | ||||||
| datapoint=dp, field=fld, original_value=metadata_value | ||||||
| datapoint=dp, field=fld, original_value=annotation_content | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Isn't it dangerous to use
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||||||
| ) | ||||||
| bad_annotations[fld].append(dp.path) | ||||||
|
|
||||||
|
|
||||||
Uh oh!
There was an error while loading. Please reload this page.