diff --git a/docs/sources/user_guide/utils/serialization.ipynb b/docs/sources/user_guide/utils/serialization.ipynb new file mode 100644 index 000000000..bf96933b4 --- /dev/null +++ b/docs/sources/user_guide/utils/serialization.ipynb @@ -0,0 +1,131 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "7ca989ce", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Functions imported correctly!\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import sys\n", + "import os\n", + "\n", + "path = os.path.abspath(os.path.join(os.getcwd(), \"..\", \"..\", \"..\", \"..\"))\n", + "if path not in sys.path:\n", + " sys.path.insert(0, path)\n", + "\n", + "from mlxtend.utils import save_model_to_json, load_model_from_json\n", + "print(\"Functions imported correctly!\")" + ] + }, + { + "cell_type": "markdown", + "id": "8f29bed8", + "metadata": {}, + "source": [ + "# Serialization - JSON Utilities\n", + "\n", + "The `serialization` module contains utility functions to export and import models in a human-readable JSON format.\n", + "\n", + "> from mlxtend.utils import save_model_to_json\n", + "> from mlxtend.utils import load_model_from_json\n", + "\n", + "## Overview\n", + "\n", + "While Python's `pickle` is a common way to serialize models, it has security risks and versioning issues. The JSON serialization utilities in `mlxtend` provide a transparent alternative that:\n", + "\n", + "1. Saves model parameters and fitted attributes in a readable text format.\n", + "2. Handles NumPy arrays and specialized types automatically.\n", + "3. Allows model reconstruction without manual class instantiation." + ] + }, + { + "cell_type": "markdown", + "id": "bdff3eac", + "metadata": {}, + "source": [ + "## Example 1 - Saving and Loading a Perceptron\n", + "\n", + "This example demonstrates training a simple `Perceptron` classifier, saving it to a JSON file, and then loading it back to verify that the state is preserved." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "182144de", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original model weights: [[0.2 0.2]]\n", + "\n", + "[Model saved to model_data.json]\n", + "Loaded model weights: [[0.2 0.2]]\n", + "\n", + " Predictions and weights are identical!\n" + ] + } + ], + "source": [ + "from sklearn.linear_model import Perceptron\n", + "import numpy as np\n", + "import os\n", + "\n", + "X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])\n", + "y = np.array([0, 0, 0, 1])\n", + "\n", + "ppn = Perceptron(eta0=0.1, random_state=1)\n", + "ppn.fit(X, y)\n", + "\n", + "print(\"Original model weights:\", ppn.coef_)\n", + "\n", + "save_model_to_json(ppn, 'model_data.json')\n", + "print(\"\\n[Model saved to model_data.json]\")\n", + "\n", + "ppn_loaded = load_model_from_json('model_data.json')\n", + "\n", + "print(\"Loaded model weights: \", ppn_loaded.coef_)\n", + "\n", + "if np.array_equal(ppn.coef_, ppn_loaded.coef_):\n", + " print(\"\\n Predictions and weights are identical!\")\n", + "\n", + "if os.path.exists('model_data.json'):\n", + " os.remove('model_data.json')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/mlxtend/utils/__init__.py b/mlxtend/utils/__init__.py index f68e6b8af..8802e1d20 100644 --- a/mlxtend/utils/__init__.py +++ b/mlxtend/utils/__init__.py @@ -4,8 +4,17 @@ # # License: BSD 3 clause + from .checking import check_Xy, format_kwarg_dictionaries from .counter import Counter +from .serialization import load_model_from_json, save_model_to_json from .testing import assert_raises -__all__ = ["Counter", "assert_raises", "check_Xy", "format_kwarg_dictionaries"] +__all__ = [ + "Counter", + "assert_raises", + "check_Xy", + "format_kwarg_dictionaries", + "save_model_to_json", + "load_model_from_json", +] diff --git a/mlxtend/utils/serialization.py b/mlxtend/utils/serialization.py new file mode 100644 index 000000000..81150a000 --- /dev/null +++ b/mlxtend/utils/serialization.py @@ -0,0 +1,49 @@ +import importlib +import json + +import numpy as np + + +class MlxtendEncoder(json.JSONEncoder): + """Custom JSON encoder to handle numpy types and fallback to strings.""" + + def default(self, obj): + if isinstance(obj, np.ndarray): + return obj.tolist() + if isinstance(obj, (np.integer, np.int64, np.int32)): + return int(obj) + if isinstance(obj, (np.floating, np.float64, np.float32)): + return float(obj) + try: + return super(MlxtendEncoder, self).default(obj) + except TypeError: + return str(obj) + + +def save_model_to_json(model, filename): + """Save an mlxtend estimator to a JSON file.""" + model_data = model.__dict__.copy() + model_data["__module__"] = model.__class__.__module__ + model_data["__class__"] = model.__class__.__name__ + + with open(filename, "w") as f: + json.dump(model_data, f, cls=MlxtendEncoder, indent=4) + + +def load_model_from_json(filename): + """Load an mlxtend estimator from a JSON file.""" + with open(filename, "r") as f: + model_data = json.load(f) + module_name = model_data.pop("__module__") + class_name = model_data.pop("__class__") + + module = importlib.import_module(module_name) + class_ = getattr(module, class_name) + model_instance = class_() + + for key, value in model_data.items(): + if isinstance(value, list) and key.endswith("_"): + setattr(model_instance, key, np.array(value)) + else: + setattr(model_instance, key, value) + return model_instance diff --git a/mlxtend/utils/tests/test_serialization.py b/mlxtend/utils/tests/test_serialization.py new file mode 100644 index 000000000..4130d339e --- /dev/null +++ b/mlxtend/utils/tests/test_serialization.py @@ -0,0 +1,42 @@ +import os + +import numpy as np +import pytest + +from mlxtend.classifier import Perceptron +from mlxtend.utils.serialization import load_model_from_json, save_model_to_json + + +def test_serialization_perceptron(): + X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) + y = np.array([0, 1, 1, 1]) + + ppn = Perceptron(epochs=5, eta=0.1) + ppn.fit(X, y) + + filename = "temp_ppn_model.json" + + try: + save_model_to_json(ppn, filename) + assert os.path.exists(filename) + + ppn_loaded = load_model_from_json(filename) + + assert ppn.__class__ == ppn_loaded.__class__ + np.testing.assert_array_almost_equal(ppn.w_, ppn_loaded.w_) + orig_pred = ppn.predict(X) + load_pred = ppn_loaded.predict(X) + np.testing.assert_array_equal(orig_pred, load_pred) + finally: + if os.path.exists(filename): + os.remove(filename) + + +def test_encoder_fallback(): + import json + + from mlxtend.utils.serialization import MlxtendEncoder + + data = {"complex_obj": open} + encoded = json.dumps(data, cls=MlxtendEncoder) + assert "built-in function open" in encoded