diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 20c39ed52639d..e7433788de334 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -41,6 +41,7 @@ interpreter/cling/tools/packaging @vgvassilev /tree/ @pcanal /tutorials/ @couet /tree/dataframe/ @martamaja10 @vepadulano +/tree/ml/ @vepadulano @siliataider /tree/readspeed/ @martamaja10 /tree/ntuple/ @jblomer /tree/ntuplebrowse/ @jblomer diff --git a/bindings/pyroot/pythonizations/python/CMakeLists.txt b/bindings/pyroot/pythonizations/python/CMakeLists.txt index 0ea818af2f146..66a6adf88a1f2 100644 --- a/bindings/pyroot/pythonizations/python/CMakeLists.txt +++ b/bindings/pyroot/pythonizations/python/CMakeLists.txt @@ -16,7 +16,8 @@ if(dataframe) ROOT/_pythonization/_rdf_conversion_maps.py ROOT/_pythonization/_rdf_pyz.py ROOT/_pythonization/_rdisplay.py - ROOT/_pythonization/_rdf_namespace.py) + ROOT/_pythonization/_rdf_namespace.py + ROOT/_pythonization/_ml_dataloader.py) endif() if(roofit) @@ -82,10 +83,6 @@ if(tmva) ROOT/_pythonization/_tmva/_sofie/_parser/_keras/layers/softmax.py ROOT/_pythonization/_tmva/_sofie/_parser/_keras/layers/swish.py ROOT/_pythonization/_tmva/_sofie/_parser/_keras/layers/tanh.py) - if(dataframe) - list(APPEND PYROOT_EXTRA_PYTHON_SOURCES - ROOT/_pythonization/_tmva/_batchgenerator.py) - endif() endif() set(py_sources diff --git a/bindings/pyroot/pythonizations/python/ROOT/_facade.py b/bindings/pyroot/pythonizations/python/ROOT/_facade.py index b745e08b49f06..6672075702b00 100644 --- a/bindings/pyroot/pythonizations/python/ROOT/_facade.py +++ b/bindings/pyroot/pythonizations/python/ROOT/_facade.py @@ -120,7 +120,6 @@ def _fallback_getattr(self, name): raise AttributeError("Failed to get attribute {} from ROOT".format(name)) def _register_converters_and_executors(self): - converter_aliases = { "Float16_t": "float", "const Float16_t&": "const float&", @@ -448,11 +447,9 @@ def TMVA(self): hasRDF = "dataframe" in self.gROOT.GetConfigFeatures() if hasRDF: try: - from 
._pythonization._tmva import inject_rbatchgenerator from ._pythonization._tmva._rtensor import _AsRTensor from ._pythonization._tmva._tree_inference import SaveXGBoost - inject_rbatchgenerator(ns) ns.Experimental.AsRTensor = _AsRTensor ns.Experimental.SaveXGBoost = SaveXGBoost except Exception: @@ -505,3 +502,13 @@ def uhi(self): except ImportError: raise Exception("Failed to pythonize the namespace uhi") return uhi_module + + @property + def Experimental(self): + ns = self._fallback_getattr("Experimental") + + from ._pythonization._ml_dataloader import _inject_dataloader_api + + _inject_dataloader_api(ns.ML) + + return ns diff --git a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_tmva/_batchgenerator.py b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_ml_dataloader.py similarity index 94% rename from bindings/pyroot/pythonizations/python/ROOT/_pythonization/_tmva/_batchgenerator.py rename to bindings/pyroot/pythonizations/python/ROOT/_pythonization/_ml_dataloader.py index b42bbec3c9773..3e0030d243d28 100644 --- a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_tmva/_batchgenerator.py +++ b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_ml_dataloader.py @@ -33,7 +33,7 @@ def get_template( max_vec_sizes: dict[str, int] = dict(), ) -> Tuple[str, list[int]]: """ - Generate a template for the RBatchGenerator based on the given + Generate a template for the DataLoader based on the given RDataFrame and columns. Args: @@ -44,7 +44,7 @@ def get_template( max_vec_sizes (list[int]): The length of each vector based column. Returns: - template (str): Template for the RBatchGenerator + template (str): Template for the DataLoader """ if not columns: @@ -102,7 +102,7 @@ def __init__( sampling_ratio: float = 1.0, replacement: bool = False, ): - """Wrapper around the Cpp RBatchGenerator + """Wrapper around the Cpp DataLoader Args: rdataframe (RNode): Name of RNode object. 
@@ -238,15 +238,15 @@ def __init__( self.train_columns = [c for c in self.all_columns if c not in self.target_columns + [self.weights_column]] - from ROOT import TMVA, EnableThreadSafety + import ROOT - # The RBatchGenerator will create a separate C++ thread for I/O. + # The DataLoader will create a separate C++ thread for I/O. # Enable thread safety in ROOT from here, to make sure there is no # interference between the main Python thread (which might call into # cling via cppyy) and the I/O thread. - EnableThreadSafety() + ROOT.EnableThreadSafety() - self.generator = TMVA.Experimental.Internal.RBatchGenerator(template)( + self.generator = ROOT.Experimental.Internal.ML.RBatchGenerator(template)( self.noded_rdfs, chunk_size, block_size, @@ -318,7 +318,7 @@ def GetSample(self): try: import numpy as np except ImportError: - raise ImportError("Failed to import numpy in batchgenerator init") + raise ImportError("Failed to import numpy needed for the ML dataloader") # Split the target and weight if not self.target_given: @@ -349,7 +349,7 @@ def ConvertBatchToNumpy(self, batch) -> np.ndarray: """Convert a RTensor into a NumPy array Args: - batch (RTensor): Batch returned from the RBatchGenerator + batch (RTensor): Batch returned from the DataLoader Returns: np.ndarray: converted batch @@ -357,7 +357,7 @@ def ConvertBatchToNumpy(self, batch) -> np.ndarray: try: import numpy as np except ImportError: - raise ImportError("Failed to import numpy in batchgenerator init") + raise ImportError("Failed to import numpy needed for the ML dataloader") data = batch.GetData() batch_size, num_columns = tuple(batch.GetShape()) @@ -391,7 +391,7 @@ def ConvertBatchToPyTorch(self, batch: Any) -> torch.Tensor: """Convert a RTensor into a PyTorch tensor Args: - batch (RTensor): Batch returned from the RBatchGenerator + batch (RTensor): Batch returned from the DataLoader Returns: torch.Tensor: converted batch @@ -432,7 +432,7 @@ def ConvertBatchToTF(self, batch: Any) -> Any: Convert a RTensor 
into a TensorFlow tensor Args: - batch (RTensor): Batch returned from the RBatchGenerator + batch (RTensor): Batch returned from the DataLoader Returns: tensorflow.Tensor: converted batch @@ -510,7 +510,7 @@ def __exit__(self, type, value, traceback): return True -class TrainRBatchGenerator: +class TrainDataLoader: def __init__(self, base_generator: BaseGenerator, conversion_function: Callable): """ A generator that returns the training batches of the given @@ -602,11 +602,11 @@ def __exit__(self, type, value, traceback): return True -class ValidationRBatchGenerator: +class ValidationDataLoader: def __init__(self, base_generator: BaseGenerator, conversion_function: Callable): """ A generator that returns the validation batches of the given base - generator. NOTE: The ValidationRBatchGenerator only returns batches + generator. NOTE: The ValidationDataLoader only returns batches if the training has been run. Args: @@ -692,7 +692,7 @@ def CreateNumPyGenerators( sampling_type: str = "", sampling_ratio: float = 1.0, replacement: bool = False, -) -> Tuple[TrainRBatchGenerator, ValidationRBatchGenerator]: +) -> Tuple[TrainDataLoader, ValidationDataLoader]: """ Return two batch generators based on the given ROOT file and tree or RDataFrame The first generator returns training batches, while the second generator @@ -758,9 +758,9 @@ def CreateNumPyGenerators( Requires load_eager = True and sampling_type = 'undersampling'. Defaults to False. Returns: - TrainRBatchGenerator or - Tuple[TrainRBatchGenerator, ValidationRBatchGenerator]: - If validation split is 0, return TrainBatchGenerator. + TrainDataLoader or + Tuple[TrainDataLoader, ValidationDataLoader]: + If validation split is 0, return TrainDataLoader. Otherwise two generators are returned. One used to load training batches, and one to load validation batches. 
NOTE: the validation @@ -789,12 +789,12 @@ def CreateNumPyGenerators( replacement, ) - train_generator = TrainRBatchGenerator(base_generator, base_generator.ConvertBatchToNumpy) + train_generator = TrainDataLoader(base_generator, base_generator.ConvertBatchToNumpy) if validation_split == 0.0: return train_generator, None - validation_generator = ValidationRBatchGenerator(base_generator, base_generator.ConvertBatchToNumpy) + validation_generator = ValidationDataLoader(base_generator, base_generator.ConvertBatchToNumpy) return train_generator, validation_generator @@ -884,9 +884,9 @@ def CreateTFDatasets( Requires load_eager = True and sampling_type = 'undersampling'. Defaults to False. Returns: - TrainRBatchGenerator or - Tuple[TrainRBatchGenerator, ValidationRBatchGenerator]: - If validation split is 0, return TrainBatchGenerator. + TrainDataLoader or + Tuple[TrainDataLoader, ValidationDataLoader]: + If validation split is 0, return TrainDataLoader. Otherwise two generators are returned. One used to load training batches, and one to load validation batches. 
NOTE: the validation @@ -916,8 +916,8 @@ def CreateTFDatasets( replacement, ) - train_generator = TrainRBatchGenerator(base_generator, base_generator.ConvertBatchToTF) - validation_generator = ValidationRBatchGenerator(base_generator, base_generator.ConvertBatchToTF) + train_generator = TrainDataLoader(base_generator, base_generator.ConvertBatchToTF) + validation_generator = ValidationDataLoader(base_generator, base_generator.ConvertBatchToTF) num_train_columns = len(train_generator.train_columns) num_target_columns = len(train_generator.target_columns) @@ -984,7 +984,7 @@ def CreatePyTorchGenerators( sampling_type: str = "", sampling_ratio: float = 1.0, replacement: bool = False, -) -> Tuple[TrainRBatchGenerator, ValidationRBatchGenerator]: +) -> Tuple[TrainDataLoader, ValidationDataLoader]: """ Return two Tensorflow Datasets based on the given ROOT file and tree or RDataFrame The first generator returns training batches, while the second generator @@ -1050,9 +1050,9 @@ def CreatePyTorchGenerators( Requires load_eager = True and sampling_type = 'undersampling'. Defaults to False. Returns: - TrainRBatchGenerator or - Tuple[TrainRBatchGenerator, ValidationRBatchGenerator]: - If validation split is 0, return TrainBatchGenerator. + TrainDataLoader or + Tuple[TrainDataLoader, ValidationDataLoader]: + If validation split is 0, return TrainDataLoader. Otherwise two generators are returned. One used to load training batches, and one to load validation batches. 
NOTE: the validation @@ -1080,11 +1080,28 @@ def CreatePyTorchGenerators( replacement, ) - train_generator = TrainRBatchGenerator(base_generator, base_generator.ConvertBatchToPyTorch) + train_generator = TrainDataLoader(base_generator, base_generator.ConvertBatchToPyTorch) if validation_split == 0.0: return train_generator - validation_generator = ValidationRBatchGenerator(base_generator, base_generator.ConvertBatchToPyTorch) + validation_generator = ValidationDataLoader(base_generator, base_generator.ConvertBatchToPyTorch) return train_generator, validation_generator + + +def _inject_dataloader_api(parentmodule): + """ + Inject the public Python API in the ROOT.Experimental.ML namespace. This includes the + functions to create dataloaders for ML training. + """ + + fns = [ + CreateNumPyGenerators, + CreateTFDatasets, + CreatePyTorchGenerators, + ] + + for python_func in fns: + func_name = python_func.__name__ + setattr(parentmodule, func_name, python_func) diff --git a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_tmva/__init__.py b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_tmva/__init__.py index e271f232326e3..acb4c6264f278 100644 --- a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_tmva/__init__.py +++ b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_tmva/__init__.py @@ -16,27 +16,6 @@ from ._dataloader import DataLoader from ._factory import Factory - -def inject_rbatchgenerator(ns): - from ._batchgenerator import ( - CreateNumPyGenerators, - CreatePyTorchGenerators, - CreateTFDatasets, - ) - - python_batchgenerator_functions = [ - CreateNumPyGenerators, - CreateTFDatasets, - CreatePyTorchGenerators, - ] - - for python_func in python_batchgenerator_functions: - func_name = python_func.__name__ - setattr(ns.Experimental, func_name, python_func) - - return ns - - # list of python classes that are used to pythonize TMVA classes python_classes = [Factory, DataLoader, CrossValidation] @@ -71,7 +50,6 @@ def 
get_defined_attributes(klass, consider_base_classes=False): method_resolution_order.remove(object) def is_defined(funcname): - if funcname in blacklist: return False @@ -138,7 +116,6 @@ def pythonize_tmva(klass, name): func_names = get_defined_attributes(python_klass) for func_name in func_names: - # if the TMVA class already has a function with the same name as our # pythonization, we rename it and prefix it with an underscore if hasattr(klass, func_name): diff --git a/bindings/pyroot/pythonizations/test/CMakeLists.txt b/bindings/pyroot/pythonizations/test/CMakeLists.txt index bd7852dfd0981..f1c081123fcbf 100644 --- a/bindings/pyroot/pythonizations/test/CMakeLists.txt +++ b/bindings/pyroot/pythonizations/test/CMakeLists.txt @@ -187,9 +187,9 @@ ROOT_ADD_PYUNITTEST(pyroot_tcomplex tcomplex_operators.py) # Tests with memory usage ROOT_ADD_PYUNITTEST(pyroot_memory memory.py) -# rbatchgenerator tests -if (tmva) - ROOT_ADD_PYUNITTEST(batchgen rbatchgenerator_completeness.py PYTHON_DEPS numpy tensorflow torch) +# ML dataloader tests +if (dataframe) + ROOT_ADD_PYUNITTEST(ml_dataloader ml_dataloader.py PYTHON_DEPS numpy tensorflow torch) endif() ROOT_ADD_PYUNITTEST(regression_18441 regression_18441.py) diff --git a/bindings/pyroot/pythonizations/test/rbatchgenerator_completeness.py b/bindings/pyroot/pythonizations/test/ml_dataloader.py similarity index 97% rename from bindings/pyroot/pythonizations/test/rbatchgenerator_completeness.py rename to bindings/pyroot/pythonizations/test/ml_dataloader.py index 3b0ad08a60d69..eb6326dc2f19e 100644 --- a/bindings/pyroot/pythonizations/test/rbatchgenerator_completeness.py +++ b/bindings/pyroot/pythonizations/test/ml_dataloader.py @@ -6,7 +6,7 @@ import ROOT -class RBatchGeneratorMultipleFiles(unittest.TestCase): +class DataLoaderMultipleFiles(unittest.TestCase): file_name1 = "first_half.root" file_name2 = "second_half.root" file_name3 = "vector_columns.root" @@ -54,7 +54,7 @@ def test01_each_element_is_generated_unshuffled(self): 
entries_before = df.AsNumpy(["rdfentry_"])["rdfentry_"] - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( df, batch_size=3, chunk_size=5, @@ -125,7 +125,7 @@ def test02_each_element_is_generated_shuffled(self): try: df = ROOT.RDataFrame(self.tree_name, self.file_name1) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( df, batch_size=3, chunk_size=5, @@ -188,7 +188,7 @@ def test03_chunk_input_smaller_than_batch_size(self): try: df = ROOT.RDataFrame(self.tree_name, self.file_name1) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( df, batch_size=3, chunk_size=3, @@ -213,7 +213,7 @@ def test04_dropping_remainder(self): try: df = ROOT.RDataFrame(self.tree_name, self.file_name1) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( df, batch_size=3, chunk_size=5, @@ -255,7 +255,7 @@ def test05_more_than_one_file(self): try: df = ROOT.RDataFrame(self.tree_name, [self.file_name1, self.file_name2]) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( df, batch_size=3, chunk_size=5, @@ -315,7 +315,7 @@ def test06_multiple_target_columns(self): try: df = ROOT.RDataFrame("myTree", file_name) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( df, batch_size=3, chunk_size=5, @@ -400,7 +400,7 @@ def test07_multiple_input_columns(self): try: df = ROOT.RDataFrame("myTree", file_name) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = 
ROOT.Experimental.ML.CreateNumPyGenerators( df, batch_size=3, chunk_size=5, @@ -470,7 +470,7 @@ def test08_filtered(self): filter_entries_before = dff.AsNumpy(["rdfentry_"])["rdfentry_"] - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( dff, batch_size=3, chunk_size=5, @@ -540,7 +540,7 @@ def test09_filtered_last_chunk(self): dff = df.Filter("b1 % 2 == 0", "name") - gen_train, _ = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, _ = ROOT.Experimental.ML.CreateNumPyGenerators( dff, batch_size=3, chunk_size=9, @@ -590,7 +590,7 @@ def test10_two_epochs_shuffled(self): try: df = ROOT.RDataFrame(self.tree_name, self.file_name1) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( df, batch_size=3, chunk_size=5, @@ -657,7 +657,7 @@ def test11_number_of_training_and_validation_batches_remainder(self): try: df = ROOT.RDataFrame(self.tree_name, self.file_name1) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( df, batch_size=3, chunk_size=5, @@ -698,7 +698,7 @@ def test12_PyTorch(self): try: df = ROOT.RDataFrame("myTree", file_name) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreatePyTorchGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreatePyTorchGenerators( df, batch_size=3, chunk_size=5, @@ -783,7 +783,7 @@ def test13_TensorFlow(self): try: df = ROOT.RDataFrame("myTree", file_name) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateTFDatasets( + gen_train, gen_validation = ROOT.Experimental.ML.CreateTFDatasets( df, batch_size=3, chunk_size=5, @@ -882,7 +882,7 @@ def test(size_of_batch, size_of_chunk, num_of_entries): try: df = ROOT.RDataFrame(tree_name, file_name) - gen_train, gen_validation = 
ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( df, batch_size=size_of_batch, chunk_size=size_of_chunk, @@ -970,7 +970,7 @@ def test15_two_runs_set_seed(self): df = ROOT.RDataFrame(self.tree_name, self.file_name1) for _ in range(2): - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( df, batch_size=3, chunk_size=5, @@ -1034,7 +1034,7 @@ def test16_vector_padding(self): df = ROOT.RDataFrame(self.tree_name, self.file_name3) max_vec_sizes = {"v1": 3, "v2": 2} - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( df, batch_size=3, chunk_size=5, @@ -1148,7 +1148,7 @@ def test16_vector_padding(self): raise -class RBatchGeneratorEagerLoading(unittest.TestCase): +class DataLoaderEagerLoading(unittest.TestCase): file_name1 = "first_half.root" file_name2 = "second_half.root" file_name3 = "vector_columns.root" @@ -1194,7 +1194,7 @@ def test01_each_element_is_generated_unshuffled(self): try: df = ROOT.RDataFrame(self.tree_name, self.file_name1) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( df, batch_size=3, target="b2", @@ -1259,7 +1259,7 @@ def test02_each_element_is_generated_shuffled(self): try: df = ROOT.RDataFrame(self.tree_name, self.file_name1) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( df, batch_size=3, target="b2", validation_split=0.4, shuffle=True, drop_remainder=False, load_eager=True ) @@ -1313,7 +1313,7 @@ def test04_dropping_remainder(self): try: df = ROOT.RDataFrame(self.tree_name, self.file_name1) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, 
gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( df, batch_size=3, target="b2", validation_split=0.4, shuffle=False, drop_remainder=True, load_eager=True ) @@ -1348,7 +1348,7 @@ def test05_more_than_one_file(self): try: df = ROOT.RDataFrame(self.tree_name, [self.file_name1, self.file_name2]) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( df, batch_size=3, target="b2", @@ -1407,7 +1407,7 @@ def test06_multiple_target_columns(self): try: df = ROOT.RDataFrame("myTree", file_name) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( df, batch_size=3, target=["b2", "b4"], @@ -1491,7 +1491,7 @@ def test07_multiple_input_columns(self): try: df = ROOT.RDataFrame("myTree", file_name) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( df, batch_size=3, target="b2", @@ -1558,7 +1558,7 @@ def test08_filtered(self): dff = df.Filter("b1 % 2 == 0", "name") - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( dff, batch_size=3, target="b2", @@ -1622,7 +1622,7 @@ def test09_filtered_last_chunk(self): dff = df.Filter("b1 % 2 == 0", "name") - gen_train, _ = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, _ = ROOT.Experimental.ML.CreateNumPyGenerators( dff, batch_size=3, target="b2", validation_split=0, shuffle=False, drop_remainder=False, load_eager=True ) @@ -1665,7 +1665,7 @@ def test10_two_epochs_shuffled(self): try: df = ROOT.RDataFrame(self.tree_name, self.file_name1) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( df, batch_size=3, target="b2", @@ -1731,7 +1731,7 @@ 
def test11_number_of_training_and_validation_batches_remainder(self): try: df = ROOT.RDataFrame(self.tree_name, self.file_name1) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( df, batch_size=3, target="b2", @@ -1771,7 +1771,7 @@ def test12_PyTorch(self): try: df = ROOT.RDataFrame("myTree", file_name) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreatePyTorchGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreatePyTorchGenerators( df, batch_size=3, target=["b2", "b4"], @@ -1855,7 +1855,7 @@ def test13_TensorFlow(self): try: df = ROOT.RDataFrame("myTree", file_name) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateTFDatasets( + gen_train, gen_validation = ROOT.Experimental.ML.CreateTFDatasets( df, batch_size=3, target=["b2", "b4"], @@ -1953,7 +1953,7 @@ def test(size_of_batch, size_of_chunk, num_of_entries): try: df = ROOT.RDataFrame(tree_name, file_name) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( df, batch_size=size_of_batch, target=["b3", "b5"], @@ -2040,7 +2040,7 @@ def test15_two_runs_set_seed(self): df = ROOT.RDataFrame(self.tree_name, self.file_name1) for _ in range(2): - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( df, batch_size=3, target="b2", @@ -2103,7 +2103,7 @@ def test16_vector_padding(self): df = ROOT.RDataFrame(self.tree_name, self.file_name3) max_vec_sizes = {"v1": 3, "v2": 2} - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( df, batch_size=3, target="b1", @@ -2216,7 +2216,7 @@ def test16_vector_padding(self): raise -class RBatchGeneratorEagerLoadingMultipleDataframes(unittest.TestCase): +class 
DataLoaderEagerLoadingMultipleDataframes(unittest.TestCase): file_name1 = "first_half.root" file_name2 = "second_half.root" file_name3 = "second_file.root" @@ -2286,7 +2286,7 @@ def test01_each_element_is_generated_unshuffled(self): df1_entries_before = df1.AsNumpy(["rdfentry_"])["rdfentry_"] df2_entries_before = df2.AsNumpy(["rdfentry_"])["rdfentry_"] - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df1, df2], batch_size=3, target="b2", @@ -2362,7 +2362,7 @@ def test02_each_element_is_generated_shuffled(self): df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) df2 = ROOT.RDataFrame(self.tree_name, self.file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df1, df2], batch_size=3, target="b2", @@ -2426,7 +2426,7 @@ def test04_dropping_remainder(self): df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) df2 = ROOT.RDataFrame(self.tree_name, self.file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df1, df2], batch_size=3, target="b2", @@ -2471,7 +2471,7 @@ def test05_more_than_one_file(self): df1 = ROOT.RDataFrame(self.tree_name, [self.file_name1, self.file_name2]) df2 = ROOT.RDataFrame(self.tree_name, self.file_name3) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df1, df2], batch_size=3, target="b2", @@ -2537,7 +2537,7 @@ def test06_multiple_target_columns(self): df1 = ROOT.RDataFrame("myTree", file_name1) df2 = ROOT.RDataFrame("myTree", file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df1, df2], batch_size=3, target=["b2", 
"b4"], @@ -2629,7 +2629,7 @@ def test07_multiple_input_columns(self): df1 = ROOT.RDataFrame("myTree", file_name1) df2 = ROOT.RDataFrame("myTree", file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df1, df2], batch_size=3, target="b2", @@ -2704,7 +2704,7 @@ def test08_filtered(self): dff1_entries_before = dff1.AsNumpy(["rdfentry_"])["rdfentry_"] dff2_entries_before = dff2.AsNumpy(["rdfentry_"])["rdfentry_"] - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [dff1, dff2], batch_size=3, target="b2", @@ -2784,7 +2784,7 @@ def test09_filtered_last_chunk(self): dff1 = df1.Filter("b1 % 2 == 0", "name") dff2 = df2.Filter("b1 % 2 == 0", "name") - gen_train, _ = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, _ = ROOT.Experimental.ML.CreateNumPyGenerators( [dff1, dff2], batch_size=3, target="b2", @@ -2837,7 +2837,7 @@ def test10_two_epochs_shuffled(self): df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) df2 = ROOT.RDataFrame(self.tree_name, self.file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df1, df2], batch_size=3, target="b2", @@ -2906,7 +2906,7 @@ def test11_number_of_training_and_validation_batches_remainder(self): df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) df2 = ROOT.RDataFrame(self.tree_name, self.file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df1, df2], batch_size=3, target="b2", @@ -2953,7 +2953,7 @@ def test12_PyTorch(self): df1 = ROOT.RDataFrame("myTree", file_name1) df2 = ROOT.RDataFrame("myTree", file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreatePyTorchGenerators( + gen_train, 
gen_validation = ROOT.Experimental.ML.CreatePyTorchGenerators( [df1, df2], batch_size=3, target=["b2", "b4"], @@ -3044,7 +3044,7 @@ def test13_TensorFlow(self): df1 = ROOT.RDataFrame("myTree", file_name1) df2 = ROOT.RDataFrame("myTree", file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateTFDatasets( + gen_train, gen_validation = ROOT.Experimental.ML.CreateTFDatasets( [df1, df2], batch_size=3, target=["b2", "b4"], @@ -3147,7 +3147,7 @@ def test(size_of_batch, size_of_chunk, num_of_entries): df1 = ROOT.RDataFrame(tree_name, file_name1) df2 = ROOT.RDataFrame(tree_name, file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df1, df2], batch_size=size_of_batch, target=["b3", "b5"], @@ -3238,7 +3238,7 @@ def test15_two_runs_set_seed(self): df2 = ROOT.RDataFrame(self.tree_name, self.file_name2) for _ in range(2): - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df1, df2], batch_size=3, target="b2", @@ -3304,7 +3304,7 @@ def test16_vector_padding(self): df2 = ROOT.RDataFrame(self.tree_name, self.file_name5) max_vec_sizes = {"v1": 3, "v2": 2} - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df1, df2], batch_size=3, target="b1", @@ -3419,7 +3419,7 @@ def test16_vector_padding(self): raise -class RBatchGeneratorRandomUndersampling(unittest.TestCase): +class DataLoaderRandomUndersampling(unittest.TestCase): file_name1 = "major.root" file_name2 = "minor.root" file_name3 = "second_file.root" @@ -3489,7 +3489,7 @@ def test01_each_element_is_generated_unshuffled(self): major_entries_before = df_major.AsNumpy(["rdfentry_"])["rdfentry_"] minor_entries_before = df_minor.AsNumpy(["rdfentry_"])["rdfentry_"] - gen_train, gen_validation = 
ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df_major, df_minor], batch_size=2, target="b2", @@ -3585,7 +3585,7 @@ def test01_each_element_is_generated_unshuffled_replacement(self): df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) df_minor = ROOT.RDataFrame(self.tree_name, self.file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df_major, df_minor], batch_size=2, target="b2", @@ -3706,7 +3706,7 @@ def test02_each_element_is_generated_shuffled(self): df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) df_minor = ROOT.RDataFrame(self.tree_name, self.file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df_major, df_minor], batch_size=2, target="b2", @@ -3773,7 +3773,7 @@ def test04_dropping_remainder(self): df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) df_minor = ROOT.RDataFrame(self.tree_name, self.file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df_major, df_minor], batch_size=2, target="b2", @@ -3826,7 +3826,7 @@ def test05_more_than_one_file(self): df_major = ROOT.RDataFrame(self.tree_name, [self.file_name1, self.file_name3]) df_minor = ROOT.RDataFrame(self.tree_name, self.file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df_major, df_minor], batch_size=2, target="b2", @@ -3906,7 +3906,7 @@ def test06_multiple_target_columns(self): df_major = ROOT.RDataFrame("myTree", file_name1) df_minor = ROOT.RDataFrame("myTree", file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, 
gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df_minor, df_major], batch_size=2, target=["b2", "b4"], @@ -4030,7 +4030,7 @@ def test07_multiple_input_columns(self): df_major = ROOT.RDataFrame("myTree", file_name1) df_minor = ROOT.RDataFrame("myTree", file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df_major, df_minor], batch_size=2, target="b2", @@ -4126,7 +4126,7 @@ def test08_filtered(self): major_filter_entries_before = df_major_filter.AsNumpy(["rdfentry_"])["rdfentry_"] minor_entries_before = df_minor.AsNumpy(["rdfentry_"])["rdfentry_"] - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df_major_filter, df_minor], batch_size=2, target="b2", @@ -4215,7 +4215,7 @@ def test10_two_epochs_shuffled(self): df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) df_minor = ROOT.RDataFrame(self.tree_name, self.file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df_major, df_minor], batch_size=2, target="b2", @@ -4286,7 +4286,7 @@ def test11_number_of_training_and_validation_batches_remainder(self): df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) df_minor = ROOT.RDataFrame(self.tree_name, self.file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df_major, df_minor], batch_size=2, target="b2", @@ -4335,7 +4335,7 @@ def test12_PyTorch(self): df_minor = ROOT.RDataFrame("myTree", file_name1) df_major = ROOT.RDataFrame("myTree", file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreatePyTorchGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreatePyTorchGenerators( [df_minor, df_major], 
batch_size=2, target=["b2", "b4"], @@ -4457,7 +4457,7 @@ def test13_TensorFlow(self): df_minor = ROOT.RDataFrame("myTree", file_name1) df_major = ROOT.RDataFrame("myTree", file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreatePyTorchGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreatePyTorchGenerators( [df_minor, df_major], batch_size=2, target=["b2", "b4"], @@ -4602,7 +4602,7 @@ def test(size_of_batch, num_of_entries_major, num_of_entries_minor, sampling_rat df1 = ROOT.RDataFrame(tree_name, file_name1) df2 = ROOT.RDataFrame(tree_name, file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df1, df2], batch_size=size_of_batch, target=["b3", "b5"], @@ -4724,7 +4724,7 @@ def test(size_of_batch, num_of_entries_major, num_of_entries_minor, sampling_rat df1 = ROOT.RDataFrame(tree_name, file_name1) df2 = ROOT.RDataFrame(tree_name, file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df1, df2], batch_size=size_of_batch, target=["b3", "b5"], @@ -4819,7 +4819,7 @@ def test15_two_runs_set_seed(self): df_minor = ROOT.RDataFrame(self.tree_name, self.file_name2) for _ in range(2): - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df_major, df_minor], batch_size=2, target="b2", @@ -4887,7 +4887,7 @@ def test16_vector_padding(self): df_minor = ROOT.RDataFrame(self.tree_name, self.file_name5) max_vec_sizes = {"v1": 3, "v2": 2} - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df_major, df_minor], batch_size=2, target="b1", @@ -5040,7 +5040,7 @@ def test16_vector_padding(self): raise -class 
RBatchGeneratorRandomOversampling(unittest.TestCase): +class DataLoaderRandomOversampling(unittest.TestCase): file_name1 = "major.root" file_name2 = "minor.root" file_name3 = "second_file.root" @@ -5110,7 +5110,7 @@ def test01_each_element_is_generated_unshuffled(self): major_entries_before = df_major.AsNumpy(["rdfentry_"])["rdfentry_"] minor_entries_before = df_minor.AsNumpy(["rdfentry_"])["rdfentry_"] - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df_major, df_minor], batch_size=2, target="b2", @@ -5205,7 +5205,7 @@ def test02_each_element_is_generated_shuffled(self): df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) df_minor = ROOT.RDataFrame(self.tree_name, self.file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df_major, df_minor], batch_size=2, target="b2", @@ -5278,7 +5278,7 @@ def test04_dropping_remainder(self): df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) df_minor = ROOT.RDataFrame(self.tree_name, self.file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df_major, df_minor], batch_size=2, target="b2", @@ -5330,7 +5330,7 @@ def test05_more_than_one_file(self): df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) df_minor = ROOT.RDataFrame(self.tree_name, [self.file_name2, self.file_name3]) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df_major, df_minor], batch_size=2, target="b2", @@ -5409,7 +5409,7 @@ def test06_multiple_target_columns(self): df_major = ROOT.RDataFrame("myTree", file_name1) df_minor = ROOT.RDataFrame("myTree", file_name2) - gen_train, gen_validation = 
ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df_minor, df_major], batch_size=2, target=["b2", "b4"], @@ -5532,7 +5532,7 @@ def test07_multiple_input_columns(self): df_major = ROOT.RDataFrame("myTree", file_name1) df_minor = ROOT.RDataFrame("myTree", file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df_major, df_minor], batch_size=2, target="b2", @@ -5626,7 +5626,7 @@ def test08_filtered(self): major_entries_before = df_major.AsNumpy(["rdfentry_"])["rdfentry_"] minor_filter_entries_before = df_minor_filter.AsNumpy(["rdfentry_"])["rdfentry_"] - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df_major, df_minor_filter], batch_size=2, target="b2", @@ -5714,7 +5714,7 @@ def test10_two_epochs_shuffled(self): df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) df_minor = ROOT.RDataFrame(self.tree_name, self.file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df_major, df_minor], batch_size=2, target="b2", @@ -5784,7 +5784,7 @@ def test11_number_of_training_and_validation_batches_remainder(self): df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) df_minor = ROOT.RDataFrame(self.tree_name, self.file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df_major, df_minor], batch_size=2, target="b2", @@ -5832,7 +5832,7 @@ def test12_PyTorch(self): df_minor = ROOT.RDataFrame("myTree", file_name1) df_major = ROOT.RDataFrame("myTree", file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreatePyTorchGenerators( + gen_train, gen_validation = 
ROOT.Experimental.ML.CreatePyTorchGenerators( [df_minor, df_major], batch_size=2, target=["b2", "b4"], @@ -5953,7 +5953,7 @@ def test13_TensorFlow(self): df_minor = ROOT.RDataFrame("myTree", file_name1) df_major = ROOT.RDataFrame("myTree", file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreatePyTorchGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreatePyTorchGenerators( [df_minor, df_major], batch_size=2, target=["b2", "b4"], @@ -6096,7 +6096,7 @@ def test(size_of_batch, num_of_entries_major, num_of_entries_minor, sampling_rat df1 = ROOT.RDataFrame(tree_name, file_name1) df2 = ROOT.RDataFrame(tree_name, file_name2) - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df1, df2], batch_size=size_of_batch, target=["b3", "b5"], @@ -6190,7 +6190,7 @@ def test15_two_runs_set_seed(self): df_minor = ROOT.RDataFrame(self.tree_name, self.file_name2) for _ in range(2): - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df_major, df_minor], batch_size=2, target="b2", @@ -6257,7 +6257,7 @@ def test16_vector_padding(self): df_minor = ROOT.RDataFrame(self.tree_name, self.file_name5) max_vec_sizes = {"v1": 3, "v2": 2} - gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( + gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( [df_major, df_minor], batch_size=2, target="b1", diff --git a/tmva/tmva/CMakeLists.txt b/tmva/tmva/CMakeLists.txt index 52be7ff086aa4..b7fd93efc928b 100644 --- a/tmva/tmva/CMakeLists.txt +++ b/tmva/tmva/CMakeLists.txt @@ -443,14 +443,6 @@ ROOT_STANDARD_LIBRARY_PACKAGE(TMVAUtils TMVA/RInferenceUtils.hxx TMVA/RBDT.hxx TMVA/RSofieReader.hxx - TMVA/BatchGenerator/RBatchGenerator.hxx - TMVA/BatchGenerator/RBatchLoader.hxx - TMVA/BatchGenerator/RChunkLoader.hxx - 
TMVA/BatchGenerator/RChunkConstructor.hxx - TMVA/BatchGenerator/RFlat2DMatrix.hxx - TMVA/BatchGenerator/RFlat2DMatrixOperators.hxx - TMVA/BatchGenerator/RDatasetLoader.hxx - TMVA/BatchGenerator/RSampler.hxx SOURCES diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RFlat2DMatrixOperators.hxx b/tmva/tmva/inc/TMVA/BatchGenerator/RFlat2DMatrixOperators.hxx deleted file mode 100644 index b9e8d8c1a14a8..0000000000000 --- a/tmva/tmva/inc/TMVA/BatchGenerator/RFlat2DMatrixOperators.hxx +++ /dev/null @@ -1,120 +0,0 @@ -// Author: Martin Føll, University of Oslo (UiO) & CERN 1/2026 - -/************************************************************************* - * Copyright (C) 1995-2026, Rene Brun and Fons Rademakers. * - * All rights reserved. * - * * - * For the licensing terms see $ROOTSYS/LICENSE. * - * For the list of contributors see $ROOTSYS/README/CREDITS. * - *************************************************************************/ - -#ifndef TMVA_RFLAT2DMATRIXOPERATORS -#define TMVA_RFLAT2DMATRIXOPERATORS - -#include -#include - -#include "TMVA/BatchGenerator/RFlat2DMatrix.hxx" - -namespace TMVA::Experimental::Internal { -// clang-format off -/** -\class ROOT::TMVA::Experimental::Internal::RFlat2DMatrixOperators -\ingroup tmva -\brief Collection of operations applied to one or multiple flat 2D matrices. 
-*/ - -class RFlat2DMatrixOperators { -private: - // clang-format on - bool fShuffle; - std::size_t fSetSeed; -public: - RFlat2DMatrixOperators(bool shuffle = true, const std::size_t setSeed = 0) - : fShuffle(shuffle), - fSetSeed(setSeed) - { - - } - - void ShuffleTensor(RFlat2DMatrix &ShuffledTensor, RFlat2DMatrix &Tensor ) - { - if (fShuffle) { - std::random_device rd; - std::mt19937 g; - - if (fSetSeed == 0) { - g.seed(rd()); - } else { - g.seed(fSetSeed); - } - - std::size_t rows = Tensor.GetRows(); - std::size_t cols = Tensor.GetCols(); - ShuffledTensor.Resize(rows, cols); - - // make an identity permutation map - std::vector indices(rows); - std::iota(indices.begin(), indices.end(), 0); - - // shuffle the identity permutation to create a new permutation - std::shuffle(indices.begin(), indices.end(), g); - - // shuffle data in the tensor with the permutation map defined above - for (std::size_t i = 0; i < rows; i++) { - std::copy(Tensor.GetData() + indices[i] * cols, - Tensor.GetData() + (indices[i] + 1) * cols, - ShuffledTensor.GetData() + i * cols); - } - } - else { - ShuffledTensor = Tensor; - } - } - - void SliceTensor(RFlat2DMatrix& SlicedTensor, RFlat2DMatrix& Tensor, - const std::vector>& slice) - { - const auto& rowSlice = slice[0]; - const auto& colSlice = slice[1]; - - std::size_t rowStart = rowSlice[0]; - std::size_t rowEnd = rowSlice[1]; - std::size_t colStart = colSlice[0]; - std::size_t colEnd = colSlice[1]; - - std::size_t rows = rowEnd - rowStart; - std::size_t cols = colEnd - colStart; - - SlicedTensor.Resize(rows, cols); - std::copy(Tensor.GetData() + rowStart * cols, - Tensor.GetData() + rowStart * cols + rows * cols, - SlicedTensor.GetData()); - } - - void ConcatenateTensors(RFlat2DMatrix &ConcatTensor, const std::vector &Tensors) - { - std::size_t cols = Tensors[0].GetCols(); - std::size_t rows = 0; - - for (const auto& t : Tensors) { - rows += t.GetRows(); - } - - ConcatTensor.Resize(rows, cols); - - std::size_t index = 0; - for 
(std::size_t i = 0; i < Tensors.size(); i++) { - std::size_t tensorRows = Tensors[i].GetRows(); - std::copy(Tensors[i].GetData(), - Tensors[i].GetData() + tensorRows * cols, - ConcatTensor.GetData() + index * cols); - index += tensorRows; - } - } - - -}; - -} // namespace TMVA::Experimental::Internal -#endif // ROOT_TMVA_RFLAT2DMATRIXOPERATORS diff --git a/tree/CMakeLists.txt b/tree/CMakeLists.txt index 969988c68b488..6d01e980cc9c6 100644 --- a/tree/CMakeLists.txt +++ b/tree/CMakeLists.txt @@ -17,3 +17,4 @@ add_subdirectory(dataframe) add_subdirectory(ntuple) add_subdirectory(ntupleutil) add_subdirectory(readspeed) +add_subdirectory(ml) diff --git a/tree/ml/CMakeLists.txt b/tree/ml/CMakeLists.txt new file mode 100644 index 0000000000000..ffb2db2714060 --- /dev/null +++ b/tree/ml/CMakeLists.txt @@ -0,0 +1,18 @@ +if(NOT pyroot OR NOT dataframe) + return() +endif() + +ROOT_STANDARD_LIBRARY_PACKAGE(ROOTMLDataLoader + HEADERS + ROOT/ML/RBatchGenerator.hxx + ROOT/ML/RBatchLoader.hxx + ROOT/ML/RChunkLoader.hxx + ROOT/ML/RChunkConstructor.hxx + ROOT/ML/RFlat2DMatrix.hxx + ROOT/ML/RFlat2DMatrixOperators.hxx + ROOT/ML/RDatasetLoader.hxx + ROOT/ML/RSampler.hxx + NO_SOURCES + DEPENDENCIES + ROOTDataFrame +) diff --git a/tree/ml/LinkDef.h b/tree/ml/LinkDef.h new file mode 100644 index 0000000000000..f9bbaea21aa77 --- /dev/null +++ b/tree/ml/LinkDef.h @@ -0,0 +1,12 @@ +#ifdef __CLING__ +#ifndef R__USE_CXXMODULES +// The following is needed only when C++ modules are not used +// - namespace is needed to enable autoloading, based on the namespace name +// - dictionary request for class template instantiation is needed to allow cppyy to request instantiation of any other +// variation of the template. It leads to forward declaring the class template both in the generated rootmap and the +// corresponding dictionary source file, and apparently only the second one is necessary for cppyy. 
+#pragma link C++ namespace ROOT::Experimental::ML; +#pragma link C++ namespace ROOT::Experimental::Internal::ML; +#pragma link C++ class ROOT::Experimental::Internal::ML::RBatchGenerator; +#endif +#endif diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RBatchGenerator.hxx b/tree/ml/inc/ROOT/ML/RBatchGenerator.hxx similarity index 93% rename from tmva/tmva/inc/TMVA/BatchGenerator/RBatchGenerator.hxx rename to tree/ml/inc/ROOT/ML/RBatchGenerator.hxx index ef559b31d6b50..2fd74b6d76cbc 100644 --- a/tmva/tmva/inc/TMVA/BatchGenerator/RBatchGenerator.hxx +++ b/tree/ml/inc/ROOT/ML/RBatchGenerator.hxx @@ -12,17 +12,17 @@ * For the list of contributors see $ROOTSYS/README/CREDITS. * *************************************************************************/ -#ifndef TMVA_RBATCHGENERATOR -#define TMVA_RBATCHGENERATOR +#ifndef ROOT_INTERNAL_ML_RBATCHGENERATOR +#define ROOT_INTERNAL_ML_RBATCHGENERATOR -#include "TMVA/BatchGenerator/RFlat2DMatrix.hxx" -#include "TMVA/BatchGenerator/RFlat2DMatrixOperators.hxx" -#include "TMVA/BatchGenerator/RSampler.hxx" +#include "ROOT/ML/RFlat2DMatrix.hxx" +#include "ROOT/ML/RFlat2DMatrixOperators.hxx" +#include "ROOT/ML/RSampler.hxx" #include "ROOT/RDF/RDatasetSpec.hxx" -#include "TMVA/BatchGenerator/RDatasetLoader.hxx" -#include "TMVA/BatchGenerator/RChunkLoader.hxx" -#include "TMVA/BatchGenerator/RBatchLoader.hxx" +#include "ROOT/ML/RDatasetLoader.hxx" +#include "ROOT/ML/RChunkLoader.hxx" +#include "ROOT/ML/RBatchLoader.hxx" #include "TROOT.h" #include @@ -33,17 +33,18 @@ #include #include -namespace TMVA { -namespace Experimental { -namespace Internal { +// Empty namespace to create a hook for the Pythonization +namespace ROOT::Experimental::ML { +} -// clang-format off +namespace ROOT::Experimental::Internal::ML { /** -\class ROOT::TMVA::Experimental::Internal::RBatchGenerator -\ingroup tmva -\brief +\class ROOT::Experimental::Internal::ML::RBatchGenerator +\brief -In this class, the processes of loading chunks (see RChunkLoader) and 
creating batches from those chunks (see RBatchLoader) are combined, allowing batches from the training and validation sets to be loaded directly from a dataset in an RDataFrame. +In this class, the processes of loading chunks (see RChunkLoader) and creating batches from those chunks (see +RBatchLoader) are combined, allowing batches from the training and validation sets to be loaded directly from a dataset +in an RDataFrame. */ template @@ -51,7 +52,6 @@ class RBatchGenerator { private: std::vector fCols; std::vector fVecSizes; - // clang-format on std::size_t fChunkSize; std::size_t fMaxChunks; std::size_t fBatchSize; @@ -340,8 +340,6 @@ public: /// Returns empty RTensor otherwise. }; -} // namespace Internal -} // namespace Experimental -} // namespace TMVA +} // namespace ROOT::Experimental::Internal::ML -#endif // TMVA_RBATCHGENERATOR +#endif // ROOT_INTERNAL_ML_RBATCHGENERATOR diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RBatchLoader.hxx b/tree/ml/inc/ROOT/ML/RBatchLoader.hxx similarity index 96% rename from tmva/tmva/inc/TMVA/BatchGenerator/RBatchLoader.hxx rename to tree/ml/inc/ROOT/ML/RBatchLoader.hxx index 1a49e3cad2b99..05ec5e59d416a 100644 --- a/tmva/tmva/inc/TMVA/BatchGenerator/RBatchLoader.hxx +++ b/tree/ml/inc/ROOT/ML/RBatchLoader.hxx @@ -12,8 +12,8 @@ * For the list of contributors see $ROOTSYS/README/CREDITS. 
* *************************************************************************/ -#ifndef TMVA_RBATCHLOADER -#define TMVA_RBATCHLOADER +#ifndef ROOT_INTERNAL_ML_RBATCHLOADER +#define ROOT_INTERNAL_ML_RBATCHLOADER #include #include @@ -24,14 +24,12 @@ #include #include -#include "TMVA/BatchGenerator/RFlat2DMatrix.hxx" -#include "TMVA/Tools.h" - -namespace TMVA::Experimental::Internal { +#include "ROOT/ML/RFlat2DMatrix.hxx" +namespace ROOT::Experimental::Internal::ML { /** -\class ROOT::TMVA::Experimental::Internal::RBatchLoader -\ingroup tmva +\class ROOT::Experimental::Internal::ML::RBatchLoader + \brief Building and loading the batches from loaded chunks in RChunkLoader In this class the chunks that are loaded into memory (see RChunkLoader) are split into batches used in the ML training @@ -249,6 +247,6 @@ public: std::size_t GetNumBatchQueue() { return fBatchQueue.size(); } }; -} // namespace TMVA::Experimental::Internal +} // namespace ROOT::Experimental::Internal::ML -#endif // TMVA_RBATCHLOADER +#endif // ROOT_INTERNAL_ML_RBATCHLOADER diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RChunkConstructor.hxx b/tree/ml/inc/ROOT/ML/RChunkConstructor.hxx similarity index 95% rename from tmva/tmva/inc/TMVA/BatchGenerator/RChunkConstructor.hxx rename to tree/ml/inc/ROOT/ML/RChunkConstructor.hxx index 7043d5458c318..e03ce5291d178 100644 --- a/tmva/tmva/inc/TMVA/BatchGenerator/RChunkConstructor.hxx +++ b/tree/ml/inc/ROOT/ML/RChunkConstructor.hxx @@ -8,8 +8,8 @@ * For the list of contributors see $ROOTSYS/README/CREDITS. 
* *************************************************************************/ -#ifndef TMVA_RCHUNKCONSTRUCTOR -#define TMVA_RCHUNKCONSTRUCTOR +#ifndef ROOT_INTERNAL_ML_RCHUNKCONSTRUCTOR +#define ROOT_INTERNAL_ML_RCHUNKCONSTRUCTOR #include @@ -19,37 +19,33 @@ #include "ROOT/RLogger.hxx" -namespace TMVA { -namespace Experimental { -namespace Internal { - -// clang-format off +namespace ROOT::Experimental::Internal::ML { /** -\class ROOT::TMVA::Experimental::Internal::RChunkConstructor -\ingroup tmva +\class ROOT::Experimental::Internal::ML::RChunkConstructor + \brief The logic for constructing chunks from a dataset. -This struct handles the logic for splitting a dataset into smaller subsets +This struct handles the logic for splitting a dataset into smaller subsets known as chunks, which are constructed from blocks. - -A chunk is the largest portion of the dataset loaded into memory at once, + +A chunk is the largest portion of the dataset loaded into memory at once, and each chunk is further divided into batches for machine learning training. - + The dataset is split into disjoint chunks based on a user-defined chunk size. There are two types of chunks: - Full chunks: contain exactly the number of entries specified by the chunk size. - Leftover chunk: contains any remaining entries that don't make up a full chunk. - + Each chunk is constructed from blocks based on a user-defined block size. There are two types of blocks: - Full blocks: contain exactly the number of entries specified by the block size. - Leftover block: contains any remaining entries that don't make up a full block. -The blocks are defined by their start and end entries, which correspond to positions within the dataset’s total number of entries. +The blocks are defined by their start and end entries, which correspond to positions within the dataset’s total number +of entries. 
*/ struct RChunkConstructor { - // clang-format on std::size_t fNumEntries{}; std::size_t fChunkSize{}; std::size_t fBlockSize{}; @@ -237,8 +233,6 @@ struct RChunkConstructor { } } }; -} // namespace Internal -} // namespace Experimental -} // namespace TMVA +} // namespace ROOT::Experimental::Internal::ML -#endif // TMVA_RCHUNKCONSTRUCTOR +#endif // ROOT_INTERNAL_ML_RCHUNKCONSTRUCTOR diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RChunkLoader.hxx b/tree/ml/inc/ROOT/ML/RChunkLoader.hxx similarity index 89% rename from tmva/tmva/inc/TMVA/BatchGenerator/RChunkLoader.hxx rename to tree/ml/inc/ROOT/ML/RChunkLoader.hxx index 602ca6a37c5ef..dcac8dc39fa50 100644 --- a/tmva/tmva/inc/TMVA/BatchGenerator/RChunkLoader.hxx +++ b/tree/ml/inc/ROOT/ML/RChunkLoader.hxx @@ -12,34 +12,29 @@ * For the list of contributors see $ROOTSYS/README/CREDITS. * *************************************************************************/ -#ifndef TMVA_RCHUNKLOADER -#define TMVA_RCHUNKLOADER +#ifndef ROOT_INTERNAL_ML_RCHUNKLOADER +#define ROOT_INTERNAL_ML_RCHUNKLOADER #include #include -#include "TMVA/BatchGenerator/RChunkConstructor.hxx" +#include "ROOT/ML/RChunkConstructor.hxx" #include "ROOT/RDataFrame.hxx" #include "ROOT/RDF/Utils.hxx" -#include "TMVA/BatchGenerator/RFlat2DMatrix.hxx" -#include "TMVA/BatchGenerator/RFlat2DMatrixOperators.hxx" +#include "ROOT/ML/RFlat2DMatrix.hxx" +#include "ROOT/ML/RFlat2DMatrixOperators.hxx" #include "ROOT/RLogger.hxx" -namespace TMVA { -namespace Experimental { -namespace Internal { - -// clang-format off +namespace ROOT::Experimental::Internal::ML { /** -\class ROOT::TMVA::Experimental::Internal::RChunkLoaderFunctor -\ingroup tmva +\class ROOT::Experimental::Internal::ML::RChunkLoaderFunctor + \brief Loading chunks made in RChunkLoader into tensors from data from RDataFrame. 
*/ template class RChunkLoaderFunctor { - // clang-format on std::size_t fOffset{}; std::size_t fVecSizeIdx{}; float fVecPadding{}; @@ -52,7 +47,7 @@ class RChunkLoaderFunctor { int fNumColumns; ////////////////////////////////////////////////////////////////////////// - /// \brief Copy the content of a column into RTensor when the column consits of vectors + /// \brief Copy the content of a column into RTensor when the column consits of vectors template ::value, int> = 0> void AssignToTensor(const T &vec, int i, int numColumns) { @@ -68,11 +63,11 @@ class RChunkLoaderFunctor { { std::copy(vec.begin(), vec.begin() + max_vec_size, dst); } - fOffset += max_vec_size; - } + fOffset += max_vec_size; + } ////////////////////////////////////////////////////////////////////////// - /// \brief Copy the content of a column into RTensor when the column consits of single values + /// \brief Copy the content of a column into RTensor when the column consits of single values template ::value, int> = 0> void AssignToTensor(const T &val, int i, int numColumns) { @@ -82,8 +77,8 @@ class RChunkLoaderFunctor { } public: - RChunkLoaderFunctor(RFlat2DMatrix &chunkTensor, std::size_t numColumns, - const std::vector &maxVecSizes, float vecPadding, int i) + RChunkLoaderFunctor(RFlat2DMatrix &chunkTensor, std::size_t numColumns, const std::vector &maxVecSizes, + float vecPadding, int i) : fChunkTensor(chunkTensor), fMaxVecSizes(maxVecSizes), fVecPadding(vecPadding), fI(i), fNumColumns(numColumns) { } @@ -95,19 +90,20 @@ public: } }; -// clang-format off /** -\class ROOT::TMVA::Experimental::Internal::RChunkLoader -\ingroup tmva +\class ROOT::Experimental::Internal::ML::RChunkLoader + \brief Building and loading the chunks from the blocks and chunks constructed in RChunkConstructor -In this class the blocks are stiches together to form chunks that are loaded into memory. The blocks used to create each chunk comes from different parts of the dataset. 
This is achieved by shuffling the blocks before distributing them into chunks. The purpose of this process is to reduce bias during machine learning training by ensuring that the data is well mixed. The dataset is also spit into training and validation sets with the user-defined validation split fraction. +In this class the blocks are stiches together to form chunks that are loaded into memory. The blocks used to create each +chunk comes from different parts of the dataset. This is achieved by shuffling the blocks before distributing them into +chunks. The purpose of this process is to reduce bias during machine learning training by ensuring that the data is well +mixed. The dataset is also spit into training and validation sets with the user-defined validation split fraction. */ template class RChunkLoader { private: - // clang-format on std::size_t fNumEntries; std::size_t fChunkSize; std::size_t fBlockSize; @@ -152,7 +148,7 @@ public: fSetSeed(setSeed) { fTensorOperators = std::make_unique(fShuffle, fSetSeed); - + fNumEntries = f_rdf.Count().GetValue(); fEntries = f_rdf.Take("rdfentry_"); @@ -208,7 +204,7 @@ public: std::shuffle(indices.begin(), indices.end(), g); } - // use the permuation to shuffle the vector of block sizes + // use the permuation to shuffle the vector of block sizes std::vector PermutedBlockSizes(BlockSizes.size()); for (int i = 0; i < BlockSizes.size(); ++i) { PermutedBlockSizes[i] = BlockSizes[indices[i]]; @@ -248,7 +244,7 @@ public: } ////////////////////////////////////////////////////////////////////////// - /// \brief Create training chunks consisiting of block intervals of different types + /// \brief Create training chunks consisiting of block intervals of different types void CreateTrainingChunksIntervals() { @@ -288,7 +284,7 @@ public: } ////////////////////////////////////////////////////////////////////////// - /// \brief Create training chunks consisiting of block intervals of different types + /// \brief Create training chunks 
consisiting of block intervals of different types void CreateValidationChunksIntervals() { std::random_device rd; @@ -328,7 +324,7 @@ public: /// \param[in] TrainChunkTensor RTensor for the training chunk /// \param[in] chunk Index of the chunk in the dataset void LoadTrainingChunk(RFlat2DMatrix &TrainChunkTensor, std::size_t chunk) - { + { std::size_t chunkSize = fTraining->ChunksSizes[chunk]; @@ -339,11 +335,10 @@ public: std::size_t chunkEntry = 0; std::vector> BlocksInChunk = fTraining->ChunksIntervals[chunk]; - std::sort(BlocksInChunk.begin(), BlocksInChunk.end(), - [](const std::pair& a, const std::pair& b) { - return a.first < b.first; - }); - + std::sort( + BlocksInChunk.begin(), BlocksInChunk.end(), + [](const std::pair &a, const std::pair &b) { return a.first < b.first; }); + for (std::size_t i = 0; i < BlocksInChunk.size(); i++) { // Use the block start and end entry to load into the chunk if the dataframe is not filtered @@ -370,7 +365,7 @@ public: // reset dataframe ROOT::Internal::RDF::ChangeBeginAndEndEntries(f_rdf, (*fEntries)[0], (*fEntries)[fNumEntries]); - + // shuffle the data in the chunk tensor fTensorOperators->ShuffleTensor(TrainChunkTensor, Tensor); } @@ -381,7 +376,7 @@ public: /// \param[in] ValidationChunkTensor RTensor for the validation chunk /// \param[in] chunk Index of the chunk in the dataset void LoadValidationChunk(RFlat2DMatrix &ValidationChunkTensor, std::size_t chunk) - { + { std::size_t chunkSize = fValidation->ChunksSizes[chunk]; @@ -391,11 +386,10 @@ public: std::size_t chunkEntry = 0; std::vector> BlocksInChunk = fValidation->ChunksIntervals[chunk]; - std::sort(BlocksInChunk.begin(), BlocksInChunk.end(), - [](const std::pair& a, const std::pair& b) { - return a.first < b.first; - }); - + std::sort( + BlocksInChunk.begin(), BlocksInChunk.end(), + [](const std::pair &a, const std::pair &b) { return a.first < b.first; }); + for (std::size_t i = 0; i < BlocksInChunk.size(); i++) { // use the block start and end entry to load 
into the chunk if the dataframe is not filtered @@ -422,17 +416,14 @@ public: // reset dataframe ROOT::Internal::RDF::ChangeBeginAndEndEntries(f_rdf, (*fEntries)[0], (*fEntries)[fNumEntries]); - + // shuffle the data in the chunk tensor - fTensorOperators->ShuffleTensor(ValidationChunkTensor, Tensor); + fTensorOperators->ShuffleTensor(ValidationChunkTensor, Tensor); } } - void ResetDataframe() - { - ROOT::Internal::RDF::ChangeBeginAndEndEntries(f_rdf, 0, fNumEntries); - } - + void ResetDataframe() { ROOT::Internal::RDF::ChangeBeginAndEndEntries(f_rdf, 0, fNumEntries); } + std::vector GetTrainingChunkSizes() { return fTraining->ChunksSizes; } std::vector GetValidationChunkSizes() { return fValidation->ChunksSizes; } @@ -442,7 +433,7 @@ public: void CheckIfUnique(RFlat2DMatrix &Tensor) { const auto &rvec = Tensor.fRVec; - if(std::set(rvec.begin(), rvec.end()).size() == rvec.size()) { + if (std::set(rvec.begin(), rvec.end()).size() == rvec.size()) { std::cout << "Tensor consists of only unique elements" << std::endl; } }; @@ -475,7 +466,5 @@ public: std::size_t GetNumValidationChunks() { return fValidation->Chunks; } }; -} // namespace Internal -} // namespace Experimental -} // namespace TMVA -#endif // TMVA_RCHUNKLOADER +} // namespace ROOT::Experimental::Internal::ML +#endif // ROOT_INTERNAL_ML_RCHUNKLOADER diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RDatasetLoader.hxx b/tree/ml/inc/ROOT/ML/RDatasetLoader.hxx similarity index 75% rename from tmva/tmva/inc/TMVA/BatchGenerator/RDatasetLoader.hxx rename to tree/ml/inc/ROOT/ML/RDatasetLoader.hxx index f0067b46f414e..1042b22033024 100644 --- a/tmva/tmva/inc/TMVA/BatchGenerator/RDatasetLoader.hxx +++ b/tree/ml/inc/ROOT/ML/RDatasetLoader.hxx @@ -8,35 +8,30 @@ * For the list of contributors see $ROOTSYS/README/CREDITS. 
* *************************************************************************/ -#ifndef TMVA_RDATASETLOADER -#define TMVA_RDATASETLOADER +#ifndef ROOT_INTERNAL_ML_RDATASETLOADER +#define ROOT_INTERNAL_ML_RDATASETLOADER #include #include -#include "TMVA/RTensor.hxx" #include "ROOT/RDataFrame.hxx" -#include "TMVA/BatchGenerator/RFlat2DMatrix.hxx" -#include "TMVA/BatchGenerator/RFlat2DMatrixOperators.hxx" +#include "ROOT/ML/RFlat2DMatrix.hxx" +#include "ROOT/ML/RFlat2DMatrixOperators.hxx" #include "ROOT/RDF/Utils.hxx" #include "ROOT/RVec.hxx" #include "ROOT/RLogger.hxx" -namespace TMVA { -namespace Experimental { -namespace Internal { +namespace ROOT::Experimental::Internal::ML { -// clang-format off /** -\class ROOT::TMVA::Experimental::Internal::RDatasetLoaderFunctor -\ingroup tmva +\class ROOT::Experimental::Internal::ML::RDatasetLoaderFunctor + \brief Loading chunks made in RDatasetLoader into tensors from data from RDataFrame. */ template class RDatasetLoaderFunctor { - // clang-format on std::size_t fOffset{}; std::size_t fVecSizeIdx{}; float fVecPadding{}; @@ -49,7 +44,7 @@ class RDatasetLoaderFunctor { int fNumColumns; ////////////////////////////////////////////////////////////////////////// - /// \brief Copy the content of a column into RTensor when the column consits of vectors + /// \brief Copy the content of a column into RTensor when the column consits of vectors template ::value, int> = 0> void AssignToTensor(const T &vec, int i, int numColumns) { @@ -68,7 +63,7 @@ class RDatasetLoaderFunctor { } ////////////////////////////////////////////////////////////////////////// - /// \brief Copy the content of a column into RTensor when the column consits of single values + /// \brief Copy the content of a column into RTensor when the column consits of single values template ::value, int> = 0> void AssignToTensor(const T &val, int i, int numColumns) { @@ -78,8 +73,12 @@ class RDatasetLoaderFunctor { public: RDatasetLoaderFunctor(RFlat2DMatrix &datasetTensor, 
std::size_t numColumns, - const std::vector &maxVecSizes, float vecPadding, int i) - : fDatasetTensor(datasetTensor), fMaxVecSizes(maxVecSizes), fVecPadding(vecPadding), fI(i), fNumColumns(numColumns) + const std::vector &maxVecSizes, float vecPadding, int i) + : fDatasetTensor(datasetTensor), + fMaxVecSizes(maxVecSizes), + fVecPadding(vecPadding), + fI(i), + fNumColumns(numColumns) { } @@ -90,19 +89,18 @@ public: } }; -// clang-format off /** -\class ROOT::TMVA::Experimental::Internal::RDatasetLoader -\ingroup tmva +\class ROOT::Experimental::Internal::ML::RDatasetLoader + \brief Load the whole dataset into memory. -In this class the whole dataset is loaded into memory. The dataset is further shuffled and spit into training and validation sets with the user-defined validation split fraction. +In this class the whole dataset is loaded into memory. The dataset is further shuffled and spit into training and +validation sets with the user-defined validation split fraction. */ template class RDatasetLoader { private: - // clang-format on std::size_t fNumEntries; float fValidationSplit; @@ -116,12 +114,12 @@ private: RFlat2DMatrix fTrainingDataset; RFlat2DMatrix fValidationDataset; - + std::size_t fNumTrainingEntries; std::size_t fNumValidationEntries; std::unique_ptr fTensorOperators; - - std::vector f_rdfs; + + std::vector f_rdfs; std::vector fCols; std::size_t fNumCols; std::size_t fSetSeed; @@ -143,7 +141,7 @@ public: fShuffle(shuffle), fSetSeed(setSeed) { - fTensorOperators = std::make_unique(fShuffle, fSetSeed); + fTensorOperators = std::make_unique(fShuffle, fSetSeed); fNumCols = fCols.size(); fSumVecSizes = std::accumulate(fVecSizes.begin(), fVecSizes.end(), 0); @@ -162,11 +160,11 @@ public: // add the last element in entries to not go out of range when filling chunks Entries->push_back((*Entries)[NumEntries - 1] + 1); - + // number of training and validation entries after the split std::size_t NumValidationEntries = static_cast(fValidationSplit * NumEntries); 
std::size_t NumTrainingEntries = NumEntries - NumValidationEntries; - + RFlat2DMatrix Dataset({NumEntries, fNumDatasetCols}); bool NotFiltered = rdf.GetFilterNames().empty(); @@ -184,36 +182,37 @@ public: datasetEntry++; } } - + // reset dataframe - ROOT::Internal::RDF::ChangeBeginAndEndEntries(rdf, (*Entries)[0], (*Entries)[NumEntries]); + ROOT::Internal::RDF::ChangeBeginAndEndEntries(rdf, (*Entries)[0], (*Entries)[NumEntries]); RFlat2DMatrix ShuffledDataset({NumEntries, fNumDatasetCols}); fTensorOperators->ShuffleTensor(ShuffledDataset, Dataset); fTensorOperators->SliceTensor(TrainingDataset, ShuffledDataset, {{0, NumTrainingEntries}, {0, fNumDatasetCols}}); - fTensorOperators->SliceTensor(ValidationDataset, ShuffledDataset, {{NumTrainingEntries, NumEntries}, {0, fNumDatasetCols}}); + fTensorOperators->SliceTensor(ValidationDataset, ShuffledDataset, + {{NumTrainingEntries, NumEntries}, {0, fNumDatasetCols}}); } ////////////////////////////////////////////////////////////////////////// /// \brief Split the dataframes in a training and validation dataset void SplitDatasets() { - fNumEntries = 0; - fNumTrainingEntries = 0; - fNumValidationEntries = 0; - - for (auto& rdf : f_rdfs) { - RFlat2DMatrix TrainingDataset; - RFlat2DMatrix ValidationDataset; - - SplitDataframe(rdf, TrainingDataset, ValidationDataset); - fTrainingDatasets.push_back(TrainingDataset); - fValidationDatasets.push_back(ValidationDataset); - - fNumTrainingEntries += TrainingDataset.GetRows(); - fNumValidationEntries += ValidationDataset.GetRows(); - fNumEntries += TrainingDataset.GetRows() + ValidationDataset.GetRows(); - } + fNumEntries = 0; + fNumTrainingEntries = 0; + fNumValidationEntries = 0; + + for (auto &rdf : f_rdfs) { + RFlat2DMatrix TrainingDataset; + RFlat2DMatrix ValidationDataset; + + SplitDataframe(rdf, TrainingDataset, ValidationDataset); + fTrainingDatasets.push_back(TrainingDataset); + fValidationDatasets.push_back(ValidationDataset); + + fNumTrainingEntries += 
TrainingDataset.GetRows(); + fNumValidationEntries += ValidationDataset.GetRows(); + fNumEntries += TrainingDataset.GetRows() + ValidationDataset.GetRows(); + } } ////////////////////////////////////////////////////////////////////////// @@ -221,20 +220,18 @@ public: void ConcatenateDatasets() { fTensorOperators->ConcatenateTensors(fTrainingDataset, fTrainingDatasets); - fTensorOperators->ConcatenateTensors(fValidationDataset, fValidationDatasets); + fTensorOperators->ConcatenateTensors(fValidationDataset, fValidationDatasets); } - - std::vector GetTrainingDatasets() {return fTrainingDatasets;} - std::vector GetValidationDatasets() {return fValidationDatasets;} - - RFlat2DMatrix GetTrainingDataset() {return fTrainingDataset;} - RFlat2DMatrix GetValidationDataset() {return fValidationDataset;} - - std::size_t GetNumTrainingEntries() {return fTrainingDataset.GetRows();} - std::size_t GetNumValidationEntries() {return fValidationDataset.GetRows();} + + std::vector GetTrainingDatasets() { return fTrainingDatasets; } + std::vector GetValidationDatasets() { return fValidationDatasets; } + + RFlat2DMatrix GetTrainingDataset() { return fTrainingDataset; } + RFlat2DMatrix GetValidationDataset() { return fValidationDataset; } + + std::size_t GetNumTrainingEntries() { return fTrainingDataset.GetRows(); } + std::size_t GetNumValidationEntries() { return fValidationDataset.GetRows(); } }; -} // namespace Internal -} // namespace Experimental -} // namespace TMVA -#endif // TMVA_RDATASETLOADER +} // namespace ROOT::Experimental::Internal::ML +#endif // ROOT_INTERNAL_ML_RDATASETLOADER diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RFlat2DMatrix.hxx b/tree/ml/inc/ROOT/ML/RFlat2DMatrix.hxx similarity index 85% rename from tmva/tmva/inc/TMVA/BatchGenerator/RFlat2DMatrix.hxx rename to tree/ml/inc/ROOT/ML/RFlat2DMatrix.hxx index 67f4ffa60e9d7..c7cc7143510e8 100644 --- a/tmva/tmva/inc/TMVA/BatchGenerator/RFlat2DMatrix.hxx +++ b/tree/ml/inc/ROOT/ML/RFlat2DMatrix.hxx @@ -1,12 +1,12 @@ 
-#ifndef ROOT_TMVA_RFLAT2DMATRIX -#define ROOT_TMVA_RFLAT2DMATRIX +#ifndef ROOT_INTERNAL_ML_RFLAT2DMATRIX +#define ROOT_INTERNAL_ML_RFLAT2DMATRIX #include #include #include "ROOT/RVec.hxx" -namespace TMVA::Experimental::Internal { +namespace ROOT::Experimental::Internal::ML { /// \brief Wrapper around ROOT::RVec representing a 2D matrix /// /// The storage is flattened row-major: index(row, col) == row * cols + col. @@ -52,5 +52,5 @@ struct RFlat2DMatrix { const float &operator[](std::size_t i) const { return fRVec[i]; } }; -} // namespace TMVA::Experimental::Internal -#endif // ROOT_TMVA_RFLAT2DMATRIX +} // namespace ROOT::Experimental::Internal::ML +#endif // ROOT_INTERNAL_ML_RFLAT2DMATRIX diff --git a/tree/ml/inc/ROOT/ML/RFlat2DMatrixOperators.hxx b/tree/ml/inc/ROOT/ML/RFlat2DMatrixOperators.hxx new file mode 100644 index 0000000000000..fae3a5faab29b --- /dev/null +++ b/tree/ml/inc/ROOT/ML/RFlat2DMatrixOperators.hxx @@ -0,0 +1,108 @@ +// Author: Martin Føll, University of Oslo (UiO) & CERN 1/2026 + +/************************************************************************* + * Copyright (C) 1995-2026, Rene Brun and Fons Rademakers. * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +#ifndef ROOT_INTERNAL_ML_RFLAT2DMATRIXOPERATORS +#define ROOT_INTERNAL_ML_RFLAT2DMATRIXOPERATORS + +#include +#include + +#include "ROOT/ML/RFlat2DMatrix.hxx" + +namespace ROOT::Experimental::Internal::ML { +/** +\class ROOT::Experimental::Internal::ML::RFlat2DMatrixOperators + +\brief Collection of operations applied to one or multiple flat 2D matrices. 
+*/ + +class RFlat2DMatrixOperators { +private: + bool fShuffle; + std::size_t fSetSeed; + +public: + RFlat2DMatrixOperators(bool shuffle = true, const std::size_t setSeed = 0) : fShuffle(shuffle), fSetSeed(setSeed) {} + + void ShuffleTensor(RFlat2DMatrix &ShuffledTensor, RFlat2DMatrix &Tensor) + { + if (fShuffle) { + std::random_device rd; + std::mt19937 g; + + if (fSetSeed == 0) { + g.seed(rd()); + } else { + g.seed(fSetSeed); + } + + std::size_t rows = Tensor.GetRows(); + std::size_t cols = Tensor.GetCols(); + ShuffledTensor.Resize(rows, cols); + + // make an identity permutation map + std::vector indices(rows); + std::iota(indices.begin(), indices.end(), 0); + + // shuffle the identity permutation to create a new permutation + std::shuffle(indices.begin(), indices.end(), g); + + // shuffle data in the tensor with the permutation map defined above + for (std::size_t i = 0; i < rows; i++) { + std::copy(Tensor.GetData() + indices[i] * cols, Tensor.GetData() + (indices[i] + 1) * cols, + ShuffledTensor.GetData() + i * cols); + } + } else { + ShuffledTensor = Tensor; + } + } + + void + SliceTensor(RFlat2DMatrix &SlicedTensor, RFlat2DMatrix &Tensor, const std::vector> &slice) + { + const auto &rowSlice = slice[0]; + const auto &colSlice = slice[1]; + + std::size_t rowStart = rowSlice[0]; + std::size_t rowEnd = rowSlice[1]; + std::size_t colStart = colSlice[0]; + std::size_t colEnd = colSlice[1]; + + std::size_t rows = rowEnd - rowStart; + std::size_t cols = colEnd - colStart; + + SlicedTensor.Resize(rows, cols); + std::copy(Tensor.GetData() + rowStart * cols, Tensor.GetData() + rowStart * cols + rows * cols, + SlicedTensor.GetData()); + } + + void ConcatenateTensors(RFlat2DMatrix &ConcatTensor, const std::vector &Tensors) + { + std::size_t cols = Tensors[0].GetCols(); + std::size_t rows = 0; + + for (const auto &t : Tensors) { + rows += t.GetRows(); + } + + ConcatTensor.Resize(rows, cols); + + std::size_t index = 0; + for (std::size_t i = 0; i < Tensors.size(); i++) { 
+ std::size_t tensorRows = Tensors[i].GetRows(); + std::copy(Tensors[i].GetData(), Tensors[i].GetData() + tensorRows * cols, + ConcatTensor.GetData() + index * cols); + index += tensorRows; + } + } +}; + +} // namespace ROOT::Experimental::Internal::ML +#endif // ROOT_INTERNAL_ML_RFLAT2DMATRIXOPERATORS diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RSampler.hxx b/tree/ml/inc/ROOT/ML/RSampler.hxx similarity index 80% rename from tmva/tmva/inc/TMVA/BatchGenerator/RSampler.hxx rename to tree/ml/inc/ROOT/ML/RSampler.hxx index 7c3415c35f768..0dbbe812c6278 100644 --- a/tmva/tmva/inc/TMVA/BatchGenerator/RSampler.hxx +++ b/tree/ml/inc/ROOT/ML/RSampler.hxx @@ -8,8 +8,8 @@ * For the list of contributors see $ROOTSYS/README/CREDITS. * *************************************************************************/ -#ifndef TMVA_RSAMPLER -#define TMVA_RSAMPLER +#ifndef ROOT_INTERNAL_ML_RSAMPLER +#define ROOT_INTERNAL_ML_RSAMPLER #include #include @@ -18,20 +18,18 @@ #include "ROOT/RDataFrame.hxx" #include "ROOT/RDF/Utils.hxx" #include "ROOT/RVec.hxx" -#include "TMVA/BatchGenerator/RFlat2DMatrixOperators.hxx" +#include "ROOT/ML/RFlat2DMatrixOperators.hxx" #include "ROOT/RLogger.hxx" -namespace TMVA::Experimental::Internal { -// clang-format off +namespace ROOT::Experimental::Internal::ML { /** -\class ROOT::TMVA::Experimental::Internal::RSampler -\ingroup tmva +\class ROOT::Experimental::Internal::ML::RSampler + \brief Implementation of different sampling strategies. 
*/ class RSampler { private: - // clang-format on std::vector &fDatasets; std::string fSampleType; float fSampleRatio; @@ -45,25 +43,26 @@ private: std::size_t fNumMajor; std::size_t fNumMinor; std::size_t fNumResampledMajor; - std::size_t fNumResampledMinor; + std::size_t fNumResampledMinor; std::vector fSamples; - - std::unique_ptr fTensorOperators; + + std::unique_ptr fTensorOperators; + public: - RSampler(std::vector &datasets, const std::string &sampleType, float sampleRatio, - bool replacement = false, bool shuffle = true, std::size_t setSeed = 0) - : fDatasets(datasets), - fSampleType(sampleType), - fSampleRatio(sampleRatio), - fReplacement(replacement), - fShuffle(shuffle), - fSetSeed(setSeed) + RSampler(std::vector &datasets, const std::string &sampleType, float sampleRatio, + bool replacement = false, bool shuffle = true, std::size_t setSeed = 0) + : fDatasets(datasets), + fSampleType(sampleType), + fSampleRatio(sampleRatio), + fReplacement(replacement), + fShuffle(shuffle), + fSetSeed(setSeed) { fTensorOperators = std::make_unique(fShuffle, fSetSeed); - + // setup the sampler for the datasets - SetupSampler(); + SetupSampler(); } ////////////////////////////////////////////////////////////////////////// @@ -72,12 +71,11 @@ public: { if (fSampleType == "undersampling") { SetupRandomUndersampler(); - } - else if (fSampleType == "oversampling") { + } else if (fSampleType == "oversampling") { SetupRandomOversampler(); } } - + ////////////////////////////////////////////////////////////////////////// /// \brief Collection of sampling types /// \param[in] SampledTensor Tensor with all the sampled entries @@ -85,8 +83,7 @@ public: { if (fSampleType == "undersampling") { RandomUndersampler(SampledTensor); - } - else if (fSampleType == "oversampling") { + } else if (fSampleType == "oversampling") { RandomOversampler(SampledTensor); } } @@ -98,15 +95,14 @@ public: if (fDatasets[0].GetRows() > fDatasets[1].GetRows()) { fMajor = 0; fMinor = 1; - } - else { + } else 
{ fMajor = 1; - fMinor = 0; + fMinor = 0; } fNumMajor = fDatasets[fMajor].GetRows(); fNumMinor = fDatasets[fMinor].GetRows(); - fNumResampledMajor = static_cast(fNumMinor / fSampleRatio); + fNumResampledMajor = static_cast(fNumMinor / fSampleRatio); fNumEntries = fNumMinor + fNumResampledMajor; } @@ -117,15 +113,14 @@ public: if (fDatasets[0].GetRows() > fDatasets[1].GetRows()) { fMajor = 0; fMinor = 1; - } - else { + } else { fMajor = 1; - fMinor = 0; + fMinor = 0; } fNumMajor = fDatasets[fMajor].GetRows(); fNumMinor = fDatasets[fMinor].GetRows(); - fNumResampledMinor = static_cast(fSampleRatio * fNumMajor); + fNumResampledMinor = static_cast(fSampleRatio * fNumMajor); fNumEntries = fNumMajor + fNumResampledMinor; } @@ -135,27 +130,28 @@ public: void RandomUndersampler(RFlat2DMatrix &ShuffledTensor) { if (fReplacement) { - SampleWithReplacement(fNumResampledMajor, fNumMajor); + SampleWithReplacement(fNumResampledMajor, fNumMajor); } - + else { SampleWithoutReplacement(fNumResampledMajor, fNumMajor); } - + std::size_t cols = fDatasets[0].GetCols(); ShuffledTensor.Reshape(fNumEntries, cols); RFlat2DMatrix SampledTensor(fNumEntries, cols); RFlat2DMatrix UndersampledMajorTensor(fNumResampledMajor, cols); - + std::size_t index = 0; for (std::size_t i = 0; i < fNumResampledMajor; i++) { - std::copy(fDatasets[fMajor].GetData() + fSamples[i] * cols, fDatasets[fMajor].GetData() + (fSamples[i]+1) * cols, + std::copy(fDatasets[fMajor].GetData() + fSamples[i] * cols, + fDatasets[fMajor].GetData() + (fSamples[i] + 1) * cols, UndersampledMajorTensor.GetData() + index * cols); index++; } fTensorOperators->ConcatenateTensors(SampledTensor, {UndersampledMajorTensor, fDatasets[fMinor]}); - fTensorOperators->ShuffleTensor(ShuffledTensor, SampledTensor); + fTensorOperators->ShuffleTensor(ShuffledTensor, SampledTensor); } ////////////////////////////////////////////////////////////////////////// @@ -163,24 +159,25 @@ public: /// \param[in] SampledTensor Tensor with all the sampled 
entries void RandomOversampler(RFlat2DMatrix &ShuffledTensor) { - SampleWithReplacement(fNumResampledMinor, fNumMinor); - + SampleWithReplacement(fNumResampledMinor, fNumMinor); + std::size_t cols = fDatasets[0].GetCols(); ShuffledTensor.Reshape(fNumEntries, cols); RFlat2DMatrix SampledTensor(fNumEntries, cols); RFlat2DMatrix OversampledMinorTensor(fNumResampledMinor, cols); - + std::size_t index = 0; for (std::size_t i = 0; i < fNumResampledMinor; i++) { - std::copy(fDatasets[fMinor].GetData() + fSamples[i] * cols, fDatasets[fMinor].GetData() + (fSamples[i]+1) * cols, + std::copy(fDatasets[fMinor].GetData() + fSamples[i] * cols, + fDatasets[fMinor].GetData() + (fSamples[i] + 1) * cols, OversampledMinorTensor.GetData() + index * cols); index++; } fTensorOperators->ConcatenateTensors(SampledTensor, {OversampledMinorTensor, fDatasets[fMajor]}); - fTensorOperators->ShuffleTensor(ShuffledTensor, SampledTensor); + fTensorOperators->ShuffleTensor(ShuffledTensor, SampledTensor); } - + ////////////////////////////////////////////////////////////////////////// /// \brief Add indices with replacement to fSamples /// \param[in] n_samples Number of indices to sample @@ -192,21 +189,21 @@ public: fSamples.reserve(n_samples); for (std::size_t i = 0; i < n_samples; ++i) { std::size_t sample; - if (fShuffle) { + if (fShuffle) { std::random_device rd; std::mt19937 g; - + if (fSetSeed == 0) { g.seed(rd()); } else { g.seed(fSetSeed); } - - sample = dist(g); + + sample = dist(g); } - + else { - sample = i % max; + sample = i % max; } fSamples.push_back(sample); } @@ -225,11 +222,11 @@ public: for (std::size_t i = 0; i < max; ++i) UniqueSamples.push_back(i); - - if (fShuffle) { + + if (fShuffle) { std::random_device rd; std::mt19937 g; - + if (fSetSeed == 0) { g.seed(rd()); } else { @@ -237,14 +234,14 @@ public: } std::shuffle(UniqueSamples.begin(), UniqueSamples.end(), g); } - + for (std::size_t i = 0; i < n_samples; ++i) { fSamples.push_back(UniqueSamples[i]); } } - std::size_t 
GetNumEntries() { return fNumEntries;} + std::size_t GetNumEntries() { return fNumEntries; } }; -} // namespace TMVA::Experimental::Internal -#endif // TMVA_RSAMPLER +} // namespace ROOT::Experimental::Internal::ML +#endif // ROOT_INTERNAL_ML_RSAMPLER diff --git a/tutorials/CMakeLists.txt b/tutorials/CMakeLists.txt index 5ac1030f83f3a..271c181af53bf 100644 --- a/tutorials/CMakeLists.txt +++ b/tutorials/CMakeLists.txt @@ -71,11 +71,11 @@ set(need_network analysis/dataframe/df027_SQliteDependencyOverVersion.C) #---Tutorials disabled depending on the build components------------- if(MSVC AND NOT win_broken_tests) - # RBatchGenerator tutorials don't work on Windows at the moment. - list(APPEND dataframe_veto machine_learning/RBatchGenerator_NumPy.py) - list(APPEND dataframe_veto machine_learning/RBatchGenerator_TensorFlow.py) - list(APPEND dataframe_veto machine_learning/RBatchGenerator_PyTorch.py) - list(APPEND dataframe_veto machine_learning/RBatchGenerator_filters_vectors.py) + # ML dataloader tutorials don't work on Windows at the moment. + list(APPEND dataframe_veto machine_learning/ml_dataloader_NumPy.py) + list(APPEND dataframe_veto machine_learning/ml_dataloader_TensorFlow.py) + list(APPEND dataframe_veto machine_learning/ml_dataloader_PyTorch.py) + list(APPEND dataframe_veto machine_learning/ml_dataloader_filters_vectors.py) # df036* and df037* seem to trigger OS errors when trying to delete the # test files created in the tutorials. It is unclear why. 
list(APPEND dataframe_veto analysis/dataframe/df036_missingBranches.C) @@ -117,10 +117,10 @@ if (NOT dataframe) list(APPEND dataframe_veto machine_learning/TMVA_SOFIE_RDataFrame*.C) list(APPEND dataframe_veto machine_learning/TMVA_SOFIE_RDataFrame*.py) list(APPEND dataframe_veto machine_learning/TMVA_SOFIE_Inference.py) - list(APPEND dataframe_veto machine_learning/RBatchGenerator_NumPy.py) - list(APPEND dataframe_veto machine_learning/RBatchGenerator_TensorFlow.py) - list(APPEND dataframe_veto machine_learning/RBatchGenerator_PyTorch.py) - list(APPEND dataframe_veto machine_learning/RBatchGenerator_filters_vectors.py) + list(APPEND dataframe_veto machine_learning/ml_dataloader_NumPy.py) + list(APPEND dataframe_veto machine_learning/ml_dataloader_TensorFlow.py) + list(APPEND dataframe_veto machine_learning/ml_dataloader_PyTorch.py) + list(APPEND dataframe_veto machine_learning/ml_dataloader_filters_vectors.py) # RooFit tutorials depending on RDataFrame list(APPEND dataframe_veto roofit/roofit/rf408_RDataFrameToRooFit.C @@ -662,7 +662,7 @@ set (multithreaded machine_learning/TMVA_CNN_Classification.py machine_learning/TMVA_Higgs_Classification.py machine_learning/TMVA_RNN_Classification.py - machine_learning/RBatchGenerator_TensorFlow.py + machine_learning/ml_dataloader_TensorFlow.py ) file(GLOB multithreaded_all_cores RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${multithreaded_all_cores}) file(GLOB multithreaded RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${multithreaded}) @@ -943,7 +943,7 @@ if(pyroot) ) file(GLOB requires_torch RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} machine_learning/pytorch/*.py - machine_learning/RBatchGenerator_PyTorch.py + machine_learning/ml_dataloader_PyTorch.py ) file(GLOB requires_xgboost RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} machine_learning/tmva101_Training.py @@ -958,7 +958,7 @@ if(pyroot) roofit/roofit/rf618_mixture_models.py # uses the xgboost sklearn plugin ) file(GLOB requires_tensorflow RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} - 
machine_learning/RBatchGenerator_TensorFlow.py + machine_learning/ml_dataloader_TensorFlow.py machine_learning/TMVA_CNN_Classification.py ) diff --git a/tutorials/machine_learning/index.md b/tutorials/machine_learning/index.md index 126f367f3bd85..0a39cc3346db2 100644 --- a/tutorials/machine_learning/index.md +++ b/tutorials/machine_learning/index.md @@ -134,7 +134,7 @@ | **Tutorial** | **Description** | |--------------|-----------------| -| RBatchGenerator_NumPy.py | Loading batches of events from a ROOT dataset as Python generators of numpy arrays. | -| RBatchGenerator_PyTorch.py | Loading batches of events from a ROOT dataset into a basic PyTorch workflow. | -| RBatchGenerator_TensorFlow.py | Loading batches of events from a ROOT dataset into a basic TensorFlow workflow. | +| ml_dataloader_NumPy.py | Loading batches of events from a ROOT dataset as Python generators of numpy arrays. | +| ml_dataloader_PyTorch.py | Loading batches of events from a ROOT dataset into a basic PyTorch workflow. | +| ml_dataloader_TensorFlow.py | Loading batches of events from a ROOT dataset into a basic TensorFlow workflow. 
| diff --git a/tutorials/machine_learning/RBatchGenerator_NumPy.py b/tutorials/machine_learning/ml_dataloader_NumPy.py similarity index 81% rename from tutorials/machine_learning/RBatchGenerator_NumPy.py rename to tutorials/machine_learning/ml_dataloader_NumPy.py index e914a44a29d56..839e5e1f7ee70 100644 --- a/tutorials/machine_learning/RBatchGenerator_NumPy.py +++ b/tutorials/machine_learning/ml_dataloader_NumPy.py @@ -23,15 +23,15 @@ num_of_epochs = 2 -gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( +gen_train, gen_validation = ROOT.Experimental.ML.CreateNumPyGenerators( rdataframe, - batch_size, + batch_size, chunk_size, - block_size, - target = target, - validation_split = 0.3, - shuffle = True, - drop_remainder = True + block_size, + target=target, + validation_split=0.3, + shuffle=True, + drop_remainder=True, ) for i in range(num_of_epochs): diff --git a/tutorials/machine_learning/RBatchGenerator_PyTorch.py b/tutorials/machine_learning/ml_dataloader_PyTorch.py similarity index 91% rename from tutorials/machine_learning/RBatchGenerator_PyTorch.py rename to tutorials/machine_learning/ml_dataloader_PyTorch.py index 29533f8098cd8..bc5e1f9c697f1 100644 --- a/tutorials/machine_learning/RBatchGenerator_PyTorch.py +++ b/tutorials/machine_learning/ml_dataloader_PyTorch.py @@ -24,14 +24,14 @@ # Returns two generators that return training and validation batches # as PyTorch tensors. 
-gen_train, gen_validation = ROOT.TMVA.Experimental.CreatePyTorchGenerators( +gen_train, gen_validation = ROOT.Experimental.ML.CreatePyTorchGenerators( rdataframe, - batch_size, + batch_size, chunk_size, - block_size, - target = target, - validation_split = 0.3, - shuffle = True, + block_size, + target=target, + validation_split=0.3, + shuffle=True, drop_remainder=True, ) @@ -83,7 +83,7 @@ def calc_accuracy(targets, pred): # # Validation # ################################################################# - model.eval() + model.eval() # Evaluate the model on the validation set for i, (x_val, y_val) in enumerate(gen_validation): # Make prediction and calculate accuracy diff --git a/tutorials/machine_learning/RBatchGenerator_TensorFlow.py b/tutorials/machine_learning/ml_dataloader_TensorFlow.py similarity index 82% rename from tutorials/machine_learning/RBatchGenerator_TensorFlow.py rename to tutorials/machine_learning/ml_dataloader_TensorFlow.py index d418d1ef7dfc3..d1c4115b53f23 100644 --- a/tutorials/machine_learning/RBatchGenerator_TensorFlow.py +++ b/tutorials/machine_learning/ml_dataloader_TensorFlow.py @@ -25,15 +25,15 @@ target = ["Type"] # Returns two TF.Dataset for training and validation batches. 
-ds_train, ds_valid = ROOT.TMVA.Experimental.CreateTFDatasets( +ds_train, ds_valid = ROOT.Experimental.ML.CreateTFDatasets( rdataframe, - batch_size, + batch_size, chunk_size, - block_size, - target = target, - validation_split = 0.3, - shuffle = True, - drop_remainder = True + block_size, + target=target, + validation_split=0.3, + shuffle=True, + drop_remainder=True, ) num_of_epochs = 2 @@ -68,6 +68,10 @@ loss_fn = tf.keras.losses.BinaryCrossentropy() model.compile(optimizer="adam", loss=loss_fn, metrics=["accuracy"]) -model.fit(ds_train_repeated, steps_per_epoch=train_batches_per_epoch, validation_data=ds_valid_repeated,\ - validation_steps=validation_batches_per_epoch, epochs=num_of_epochs) - +model.fit( + ds_train_repeated, + steps_per_epoch=train_batches_per_epoch, + validation_data=ds_valid_repeated, + validation_steps=validation_batches_per_epoch, + epochs=num_of_epochs, +) diff --git a/tutorials/machine_learning/RBatchGenerator_filters_vectors.py b/tutorials/machine_learning/ml_dataloader_filters_vectors.py similarity index 65% rename from tutorials/machine_learning/RBatchGenerator_filters_vectors.py rename to tutorials/machine_learning/ml_dataloader_filters_vectors.py index 7666e2ee2868f..fdc9923527b6a 100644 --- a/tutorials/machine_learning/RBatchGenerator_filters_vectors.py +++ b/tutorials/machine_learning/ml_dataloader_filters_vectors.py @@ -8,35 +8,32 @@ ################################################## # This tutorial shows the usage of filters and vectors -# when using RBatchGenerator +# when using the ROOT ML dataloader ################################################## import ROOT tree_name = "test_tree" -file_name = ( - ROOT.gROOT.GetTutorialDir().Data() - + "/machine_learning/RBatchGenerator_filters_vectors_hvector.root" -) +file_name = ROOT.gROOT.GetTutorialDir().Data() + "/machine_learning/ml_dataloader_filters_vectors_hvector.root" chunk_size = 50 # Defines the size of the chunks batch_size = 5 # Defines the size of the returned batches 
-block_size = 10 # Defines the size of the blocks that builds up a chunk +block_size = 10 # Defines the size of the blocks that builds up a chunk rdataframe = ROOT.RDataFrame(tree_name, file_name) # Define filters, filters must be named -filteredrdf = rdataframe.Filter("f1 > 30", "first_filter")\ - .Filter("f2 < 70", "second_filter")\ - .Filter("f3==true", "third_filter") +filteredrdf = ( + rdataframe.Filter("f1 > 30", "first_filter").Filter("f2 < 70", "second_filter").Filter("f3==true", "third_filter") +) max_vec_sizes = {"f4": 3, "f5": 2, "f6": 1} -ds_train, ds_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( +ds_train, ds_validation = ROOT.Experimental.ML.CreateNumPyGenerators( filteredrdf, batch_size, chunk_size, - block_size, + block_size, validation_split=0.3, max_vec_sizes=max_vec_sizes, shuffle=False, diff --git a/tutorials/machine_learning/RBatchGenerator_filters_vectors_hvector.root b/tutorials/machine_learning/ml_dataloader_filters_vectors_hvector.root similarity index 100% rename from tutorials/machine_learning/RBatchGenerator_filters_vectors_hvector.root rename to tutorials/machine_learning/ml_dataloader_filters_vectors_hvector.root