From afc55265dbe6708d6ff1b192f0fd6dcbd73a55f7 Mon Sep 17 00:00:00 2001 From: Kabir Khan Date: Thu, 9 Mar 2023 16:41:42 -0800 Subject: [PATCH 1/5] add validation funcs for JSON api to convert to dict/list --- srsly/_json_api.py | 68 +++++++++++++++++++++++++++++++++++- srsly/tests/test_json_api.py | 42 ++++++++++++++++++++++ 2 files changed, 109 insertions(+), 1 deletion(-) diff --git a/srsly/_json_api.py b/srsly/_json_api.py index 24d25fd..eefe244 100644 --- a/srsly/_json_api.py +++ b/srsly/_json_api.py @@ -1,4 +1,4 @@ -from typing import Union, Iterable, Sequence, Any, Optional, Iterator +from typing import Any, Iterable, Dict, List, Optional, Iterator, Union, Type, cast import sys import json as _builtin_json import gzip @@ -39,6 +39,32 @@ def json_loads(data: Union[str, bytes]) -> JSONOutput: return ujson.loads(data) +def json_loads_dict(data: Union[str, bytes]) -> Dict[str, Any]: + """Deserialize unicode or bytes to a Python dict. + + data (str / bytes): The data to deserialize. + RAISES: ValueError if the loaded data is not a dict + RETURNS: The deserialized Python dict. + """ + obj = json_loads(data) + if not isinstance(obj, dict): + raise ValueError("JSON data could not be parsed to a dict.") + return obj + + +def json_loads_list(data: Union[str, bytes]) -> List[Dict[str, Any]]: + """Deserialize unicode or bytes to a Python list of dicts. + + data (str / bytes): The data to deserialize. + RAISES: ValueError if the loaded data is not a list + RETURNS: The deserialized Python list. + """ + loaded = json_loads(data) + if not isinstance(loaded, list): + raise ValueError("JSON data could not be parsed to a list of dicts.") + return loaded + + def read_json(path: FilePath) -> JSONOutput: """Load JSON from file or standard input. @@ -53,6 +79,30 @@ def read_json(path: FilePath) -> JSONOutput: return ujson.load(f) +def read_json_dict(path: FilePath) -> Dict[str, Any]: + """Load JSON from file or standard input. + + path (FilePath): The file path. "-" for reading from stdin. + RETURNS (JSONOutput): The loaded JSON content. + """ + data = read_json(path) + if not isinstance(data, dict): + raise ValueError("Invalid JSON, data could not be parsed to a dict.") + return data + + +def read_json_list(path: FilePath) -> List[Dict[str, Any]]: + """Load JSON from file or standard input. + + path (FilePath): The file path. "-" for reading from stdin. + RETURNS (JSONOutput): The loaded JSON content. + """ + data = read_json(path) + if not isinstance(data, list): + raise ValueError("Invalid JSON, data could not be parsed to a list of dicts.") + return data + + def read_gzip_json(path: FilePath) -> JSONOutput: """Load JSON from a gzipped file. @@ -149,6 +199,22 @@ def read_jsonl(path: FilePath, skip: bool = False) -> Iterable[JSONOutput]: yield line +def read_jsonl_dicts(path: FilePath, skip: bool = False) -> Iterable[Dict[str, Any]]: + """Read a .jsonl file or standard input and yield contents line by line. + Blank lines will always be skipped. Validates the contents of each line is a dict. + + path (FilePath): The file path. "-" for reading from stdin. + skip (bool): Skip broken lines and don't raise ValueError. + YIELDS (JSONOutput): The loaded JSON contents of each line. + """ + for i, line in enumerate(read_jsonl(path, skip=skip)): + if not isinstance(line, dict): + if skip: + continue + raise ValueError(f"Invalid JSON Object on line: {i + 1}. Line is not a valid dict.") + yield line + + def write_jsonl( path: FilePath, lines: Iterable[JSONInput], diff --git a/srsly/tests/test_json_api.py b/srsly/tests/test_json_api.py index 89ce400..990d515 100644 --- a/srsly/tests/test_json_api.py +++ b/srsly/tests/test_json_api.py @@ -4,8 +4,13 @@ import gzip import numpy +from typing import Any, Dict, List, Union + from .._json_api import ( read_json, + read_json_dict, + read_json_list, + read_jsonl_dicts, write_json, read_jsonl, write_jsonl, @@ -262,3 +267,40 @@ def test_read_jsonl_gzip(): assert len(data[1]) == 1 assert data[0]["hello"] == "world" assert data[1]["test"] == 123 + + +READ_JSON_DICT_TEST_CASES = { + +} + + +READ_JSONL_DICT_TEST_CASES = { + "invalid_str": ('"test"', ValueError()), + "invalid_num": ('-32', ValueError()), + "invalid_json_list": ('[{"hello": "world"}\n{"test": 123}]', ValueError()), + "valid_dicts": ('{"hello": "world"}\n{"test": 123}', [{"hello": "world"}, {"test": 123}]), +} + +@pytest.mark.parametrize( + "file_contents, expected", + READ_JSONL_DICT_TEST_CASES.values(), + ids=READ_JSONL_DICT_TEST_CASES.keys() +) +def test_read_jsonl_dicts(file_contents: str, expected: Union[List[Dict[str, Any]], ValueError]): + + with make_tempdir({"tmp.json": file_contents}) as temp_dir: + file_path = temp_dir / "tmp.json" + assert file_path.exists() + data = read_jsonl_dicts(file_path) + # Make sure this returns a generator, not just a list + assert not hasattr(data, "__len__") + try: + # actually consume the generator to trigger errors + data = list(data) + except ValueError: + assert isinstance(expected, ValueError) + else: + assert isinstance(expected, list) + assert len(data) == len(expected) + for data_item, expected_item in zip(data, expected): + assert data_item == expected_item From 4e62665ae9dfbb257989bf38c72857497405472a Mon Sep 17 00:00:00 2001 From: Kabir Khan Date: Thu, 9 Mar 2023 16:56:44 -0800 Subject: [PATCH 2/5] add per index validation to read_json_list --- srsly/_json_api.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/srsly/_json_api.py b/srsly/_json_api.py index eefe244..fd9f231 100644 --- a/srsly/_json_api.py +++ b/srsly/_json_api.py @@ -91,15 +91,25 @@ def read_json_dict(path: FilePath) -> Dict[str, Any]: return data -def read_json_list(path: FilePath) -> List[Dict[str, Any]]: +def read_json_list(path: FilePath, validate_inner: bool = False, skip_invalid: bool = False) -> List[Dict[str, Any]]: """Load JSON from file or standard input. path (FilePath): The file path. "-" for reading from stdin. RETURNS (JSONOutput): The loaded JSON content. """ + data = read_json(path) + err_msg = "Invalid JSON, data could not be parsed to a list of dicts." if not isinstance(data, list): - raise ValueError("Invalid JSON, data could not be parsed to a list of dicts.") + raise ValueError(err_msg) + + output = [] + for i, obj in enumerate(data): + if not isinstance(obj, dict): + if skip_invalid: + continue + raise ValueError(f"Invalid JSON Object at index: {i + 1}. Value is not a valid dict.") + output.append(obj) return data From 70e884624dc362304f67b9af4300a79247c2d8f4 Mon Sep 17 00:00:00 2001 From: Kabir Khan Date: Fri, 10 Mar 2023 09:44:04 -0800 Subject: [PATCH 3/5] fix inner validation for read_json_list --- srsly/_json_api.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/srsly/_json_api.py b/srsly/_json_api.py index fd9f231..c3b5492 100644 --- a/srsly/_json_api.py +++ b/srsly/_json_api.py @@ -99,17 +99,19 @@ def read_json_list(path: FilePath, validate_inner: bool = False, skip_invalid: b """ data = read_json(path) - err_msg = "Invalid JSON, data could not be parsed to a list of dicts." if not isinstance(data, list): - raise ValueError(err_msg) - - output = [] - for i, obj in enumerate(data): - if not isinstance(obj, dict): - if skip_invalid: - continue - raise ValueError(f"Invalid JSON Object at index: {i + 1}. Value is not a valid dict.") - output.append(obj) + raise ValueError("Invalid JSON, data could not be parsed to a list of dicts.") + + if validate_inner: + output = [] + for i, obj in enumerate(data): + if not isinstance(obj, dict): + if skip_invalid: + continue + raise ValueError(f"Invalid JSON Object at index: {i + 1}. Value is not a valid dict.") + output.append(obj) + else: + output = data return data From 47dcc486d94bddee575001bcb260c27154f4c9fd Mon Sep 17 00:00:00 2001 From: Kabir Khan Date: Tue, 14 Mar 2023 16:32:01 -0700 Subject: [PATCH 4/5] add separate read_json_list_of_dicts and rm validate_inner idea. Don't say invalid JSON, just invalid type --- srsly/_json_api.py | 38 +++++++++++++++++++++++------------- srsly/tests/test_json_api.py | 11 +++-------- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/srsly/_json_api.py b/srsly/_json_api.py index c3b5492..8edc335 100644 --- a/srsly/_json_api.py +++ b/srsly/_json_api.py @@ -87,11 +87,11 @@ def read_json_dict(path: FilePath) -> Dict[str, Any]: """ data = read_json(path) if not isinstance(data, dict): - raise ValueError("Invalid JSON, data could not be parsed to a dict.") + raise ValueError("JSON data could not be parsed to a dict.") return data -def read_json_list(path: FilePath, validate_inner: bool = False, skip_invalid: bool = False) -> List[Dict[str, Any]]: +def read_json_list(path: FilePath) -> List[JSONOutput]: """Load JSON from file or standard input. path (FilePath): The file path. "-" for reading from stdin. @@ -100,21 +100,31 @@ def read_json_list(path: FilePath, validate_inner: bool = False, skip_invalid: b data = read_json(path) if not isinstance(data, list): - raise ValueError("Invalid JSON, data could not be parsed to a list of dicts.") - - if validate_inner: - output = [] - for i, obj in enumerate(data): - if not isinstance(obj, dict): - if skip_invalid: - continue - raise ValueError(f"Invalid JSON Object at index: {i + 1}. Value is not a valid dict.") - output.append(obj) - else: - output = data + raise ValueError("JSON data could not be parsed to a list.") return data + +def read_json_list_of_dicts(path: FilePath, skip_invalid: bool = False) -> List[Dict[str, Any]]: + """Load JSON from file or standard input. + + path (FilePath): The file path. "-" for reading from stdin. + RETURNS (JSONOutput): The loaded JSON content. + """ + + data = read_json(path) + if not isinstance(data, list): + raise ValueError("JSON data could not be parsed to a list.") + output = [] + for i, obj in enumerate(data): + if not isinstance(obj, dict): + if skip_invalid: + continue + raise ValueError(f"JSON object at index: {i + 1} of list could not be parsed to a valid dict.") + output.append(obj) + return output + + def read_gzip_json(path: FilePath) -> JSONOutput: """Load JSON from a gzipped file. diff --git a/srsly/tests/test_json_api.py b/srsly/tests/test_json_api.py index 990d515..13d8889 100644 --- a/srsly/tests/test_json_api.py +++ b/srsly/tests/test_json_api.py @@ -269,12 +269,7 @@ def test_read_jsonl_gzip(): assert data[1]["test"] == 123 -READ_JSON_DICT_TEST_CASES = { - -} - - -READ_JSONL_DICT_TEST_CASES = { +READ_JSONL_DICTS_TEST_CASES = { "invalid_str": ('"test"', ValueError()), "invalid_num": ('-32', ValueError()), "invalid_json_list": ('[{"hello": "world"}\n{"test": 123}]', ValueError()), @@ -283,8 +278,8 @@ def test_read_jsonl_gzip(): @pytest.mark.parametrize( "file_contents, expected", - READ_JSONL_DICT_TEST_CASES.values(), - ids=READ_JSONL_DICT_TEST_CASES.keys() + READ_JSONL_DICTS_TEST_CASES.values(), + ids=READ_JSONL_DICTS_TEST_CASES.keys() ) def test_read_jsonl_dicts(file_contents: str, expected: Union[List[Dict[str, Any]], ValueError]): From d174d885e3787d83cc3aaedb56bdcf845c80a31e Mon Sep 17 00:00:00 2001 From: Kabir Khan Date: Tue, 14 Mar 2023 16:34:02 -0700 Subject: [PATCH 5/5] fix some comments --- srsly/_json_api.py | 14 +++++++------- srsly/tests/test_json_api.py | 4 +++- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/srsly/_json_api.py b/srsly/_json_api.py index 8edc335..46d8140 100644 --- a/srsly/_json_api.py +++ b/srsly/_json_api.py @@ -39,7 +39,7 @@ def json_loads(data: Union[str, bytes]) -> JSONOutput: return ujson.loads(data) -def json_loads_dict(data: Union[str, bytes]) -> Dict[str, Any]: +def json_loads_dict(data: Union[str, bytes]) -> Dict[str, JSONOutput]: """Deserialize unicode or bytes to a Python dict. data (str / bytes): The data to deserialize. @@ -52,7 +52,7 @@ def json_loads_dict(data: Union[str, bytes]) -> Dict[str, Any]: return obj -def json_loads_list(data: Union[str, bytes]) -> List[Dict[str, Any]]: +def json_loads_list(data: Union[str, bytes]) -> List[Dict[str, JSONOutput]]: """Deserialize unicode or bytes to a Python list of dicts. data (str / bytes): The data to deserialize. @@ -79,7 +79,7 @@ def read_json(path: FilePath) -> JSONOutput: return ujson.load(f) -def read_json_dict(path: FilePath) -> Dict[str, Any]: +def read_json_dict(path: FilePath) -> Dict[str, JSONOutput]: """Load JSON from file or standard input. path (FilePath): The file path. "-" for reading from stdin. @@ -92,7 +92,7 @@ def read_json_dict(path: FilePath) -> Dict[str, Any]: def read_json_list(path: FilePath) -> List[JSONOutput]: - """Load JSON from file or standard input. + """Load JSON from file or standard input. Parse as a list path (FilePath): The file path. "-" for reading from stdin. RETURNS (JSONOutput): The loaded JSON content. @@ -105,8 +105,8 @@ def read_json_list(path: FilePath) -> List[JSONOutput]: -def read_json_list_of_dicts(path: FilePath, skip_invalid: bool = False) -> List[Dict[str, Any]]: - """Load JSON from file or standard input. +def read_json_list_of_dicts(path: FilePath, skip_invalid: bool = False) -> List[Dict[str, JSONOutput]]: + """Load JSON from file or standard input. Parse as list of dicts path (FilePath): The file path. "-" for reading from stdin. RETURNS (JSONOutput): The loaded JSON content. @@ -221,7 +221,7 @@ def read_jsonl(path: FilePath, skip: bool = False) -> Iterable[JSONOutput]: yield line -def read_jsonl_dicts(path: FilePath, skip: bool = False) -> Iterable[Dict[str, Any]]: +def read_jsonl_dicts(path: FilePath, skip: bool = False) -> Iterable[Dict[str, JSONOutput]]: """Read a .jsonl file or standard input and yield contents line by line. Blank lines will always be skipped. Validates the contents of each line is a dict. diff --git a/srsly/tests/test_json_api.py b/srsly/tests/test_json_api.py index 13d8889..1764c83 100644 --- a/srsly/tests/test_json_api.py +++ b/srsly/tests/test_json_api.py @@ -7,6 +7,7 @@ from typing import Any, Dict, List, Union from .._json_api import ( + JSONOutput, read_json, read_json_dict, read_json_list, @@ -16,6 +17,7 @@ write_jsonl, read_gzip_jsonl, write_gzip_jsonl, + ) from .._json_api import write_gzip_json, json_dumps, is_json_serializable from .._json_api import json_loads @@ -281,7 +283,7 @@ def test_read_jsonl_gzip(): READ_JSONL_DICTS_TEST_CASES.values(), ids=READ_JSONL_DICTS_TEST_CASES.keys() ) -def test_read_jsonl_dicts(file_contents: str, expected: Union[List[Dict[str, Any]], ValueError]): +def test_read_jsonl_dicts(file_contents: str, expected: Union[List[Dict[str, JSONOutput]], ValueError]): with make_tempdir({"tmp.json": file_contents}) as temp_dir: file_path = temp_dir / "tmp.json"