1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -49,6 +49,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum
* Aligned `dpnp.trim_zeros` with NumPy 2.4 to support a tuple of integers passed with `axis` keyword [#2746](https://github.com/IntelPython/dpnp/pull/2746)
* Aligned `strides` property of `dpnp.ndarray` with NumPy and CuPy implementations [#2747](https://github.com/IntelPython/dpnp/pull/2747)
* Extended `dpnp.nan_to_num` to support broadcasting of `nan`, `posinf`, and `neginf` keywords [#2754](https://github.com/IntelPython/dpnp/pull/2754)
+* Changed `dpnp.partition` implementation to reuse `dpnp.sort` where it brings the performance benefit [#2766](https://github.com/IntelPython/dpnp/pull/2766)

### Deprecated

89 changes: 13 additions & 76 deletions dpnp/backend/kernels/dpnp_krnl_sorting.cpp
@@ -70,90 +70,27 @@ DPCTLSyclEventRef dpnp_partition_c(DPCTLSyclQueueRef q_ref,

    sycl::queue q = *(reinterpret_cast<sycl::queue *>(q_ref));

-    if (ndim == 1) // 1d array with C-contiguous data
-    {
-        _DataType *arr = static_cast<_DataType *>(array1_in);
-        _DataType *result = static_cast<_DataType *>(result1);
+    _DataType *arr = static_cast<_DataType *>(array1_in);
+    _DataType *result = static_cast<_DataType *>(result1);

-        auto policy = oneapi::dpl::execution::make_device_policy<
-            dpnp_partition_c_kernel<_DataType>>(q);
+    auto policy = oneapi::dpl::execution::make_device_policy<
+        dpnp_partition_c_kernel<_DataType>>(q);

-        // fill the result array with data from input one
-        q.memcpy(result, arr, size * sizeof(_DataType)).wait();
+    // fill the result array with data from input one
+    q.memcpy(result, arr, size * sizeof(_DataType)).wait();

-        // make a partial sorting such that:
+    // note, a loop for a multidemension input array (size_ > 1) is an
+    // experimental and it isn't tested properly as for now
+    for (size_t i = 0; i < size_; i++) {
+        _DataType *bufptr = result + i * shape_[0];
+
+        // for every slice it makes a partial sorting such that:
        // 1. result[0 <= i < kth] <= result[kth]
        // 2. result[kth <= i < size] >= result[kth]
        // event-blocking call, no need for wait()
-        std::nth_element(policy, result, result + kth, result + size,
+        std::nth_element(policy, bufptr, bufptr + kth, bufptr + size,
                         dpnp_less_comp());
-        return event_ref;
    }

-    DPNPC_ptr_adapter<_DataType> input1_ptr(q_ref, array1_in, size, true);
-    DPNPC_ptr_adapter<_DataType> input2_ptr(q_ref, array2_in, size, true);
-    DPNPC_ptr_adapter<_DataType> result1_ptr(q_ref, result1, size, true, true);
-    _DataType *arr = input1_ptr.get_ptr();
-    _DataType *arr2 = input2_ptr.get_ptr();
-    _DataType *result = result1_ptr.get_ptr();
-
-    auto arr_to_result_event = q.memcpy(result, arr, size * sizeof(_DataType));
-    arr_to_result_event.wait();
-
-    _DataType *matrix = new _DataType[shape_[ndim - 1]];
-
-    for (size_t i = 0; i < size_; ++i) {
-        size_t ind_begin = i * shape_[ndim - 1];
-        size_t ind_end = (i + 1) * shape_[ndim - 1] - 1;
-
-        for (size_t j = ind_begin; j < ind_end + 1; ++j) {
-            size_t ind = j - ind_begin;
-            matrix[ind] = arr2[j];
-        }
-        std::partial_sort(matrix, matrix + shape_[ndim - 1],
-                          matrix + shape_[ndim - 1], dpnp_less_comp());
-        for (size_t j = ind_begin; j < ind_end + 1; ++j) {
-            size_t ind = j - ind_begin;
-            arr2[j] = matrix[ind];
-        }
-    }
-
-    shape_elem_type *shape = reinterpret_cast<shape_elem_type *>(
-        sycl::malloc_shared(ndim * sizeof(shape_elem_type), q));
-    auto memcpy_event = q.memcpy(shape, shape_, ndim * sizeof(shape_elem_type));
-
-    memcpy_event.wait();
-
-    sycl::range<2> gws(size_, kth + 1);
-    auto kernel_parallel_for_func = [=](sycl::id<2> global_id) {
-        size_t j = global_id[0];
-        size_t k = global_id[1];
-
-        _DataType val = arr2[j * shape[ndim - 1] + k];
-
-        for (size_t i = 0; i < static_cast<size_t>(shape[ndim - 1]); ++i) {
-            if (result[j * shape[ndim - 1] + i] == val) {
-                _DataType change_val1 = result[j * shape[ndim - 1] + i];
-                _DataType change_val2 = result[j * shape[ndim - 1] + k];
-                result[j * shape[ndim - 1] + k] = change_val1;
-                result[j * shape[ndim - 1] + i] = change_val2;
-            }
-        }
-    };
-
-    auto kernel_func = [&](sycl::handler &cgh) {
-        cgh.depends_on({memcpy_event});
-        cgh.parallel_for<class dpnp_partition_c_kernel<_DataType>>(
-            gws, kernel_parallel_for_func);
-    };
-
-    auto event = q.submit(kernel_func);
-
-    event.wait();
-
-    delete[] matrix;
-    sycl::free(shape, q);
-
    return event_ref;
}

41 changes: 30 additions & 11 deletions dpnp/dpnp_array.py
@@ -1459,35 +1459,54 @@ def nonzero(self):

    def partition(self, /, kth, axis=-1, kind="introselect", order=None):
        """
-        Return a partitioned copy of an array.
+        Partially sorts the elements in the array in such a way that the value
+        of the element in k-th position is in the position it would be in a
+        sorted array. In the output array, all elements smaller than the k-th
+        element are located to the left of this element and all equal or
+        greater are located to its right. The ordering of the elements in the
+        two partitions on the either side of the k-th element in the output
+        array is undefined.

-        Rearranges the elements in the array in such a way that the value of
-        the element in `kth` position is in the position it would be in
-        a sorted array.
+        Refer to `dpnp.partition` for full documentation.

-        All elements smaller than the `kth` element are moved before this
-        element and all equal or greater are moved behind it. The ordering
-        of the elements in the two partitions is undefined.
+        kth : {int, sequence of ints}
+            Element index to partition by. The kth element value will be in its
+            final sorted position and all smaller elements will be moved before
+            it and all equal or greater elements behind it.
+            The order of all elements in the partitions is undefined. If
+            provided with a sequence of kth it will partition all elements
+            indexed by kth of them into their sorted position at once.
+        axis : int, optional
+            Axis along which to sort. The default is ``-1``, which means sort
+            along the the last axis.

-        Refer to `dpnp.partition` for full documentation.
+            Default: ``-1``.

        See Also
        --------
        :obj:`dpnp.partition` : Return a partitioned copy of an array.
+        :obj:`dpnp.argpartition` : Indirect partition.
+        :obj:`dpnp.sort` : Full sort.

        Examples
        --------
        >>> import dpnp as np
        >>> a = np.array([3, 4, 2, 1])
        >>> a.partition(3)
        >>> a
        array([1, 2, 3, 4]) # may vary
+
+        >>> a.partition((1, 3))
+        >>> a
+        array([1, 2, 3, 4])

        """

-        self._array_obj = dpnp.partition(
-            self, kth, axis=axis, kind=kind, order=order
-        ).get_array()
+        if axis is None:
+            raise TypeError(
+                "'NoneType' object cannot be interpreted as an integer"
+            )
+        self[...] = dpnp.partition(self, kth, axis=axis, kind=kind, order=order)
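As a quick usage sketch of the in-place method shown above (assuming a dpnp installation with a default SYCL device available): the partitioned result is written back into the array itself, and, per the new implementation, axis=None is rejected with a TypeError.

import dpnp as np

a = np.array([3, 4, 2, 1])
# In-place partition: afterwards a[2] holds the value it would have in a
# fully sorted array, and everything before it is less than or equal to it.
a.partition(2)
print(a)  # e.g. array([2, 1, 3, 4]); order inside each partition may vary

# Unlike dpnp.partition, which flattens on axis=None, the method raises.
try:
    a.partition(1, axis=None)
except TypeError as exc:
    print(exc)  # 'NoneType' object cannot be interpreted as an integer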

def prod(
self,
142 changes: 113 additions & 29 deletions dpnp/dpnp_iface_sorting.py
@@ -39,8 +39,9 @@

"""

+from collections.abc import Sequence

import dpctl.tensor as dpt
-import numpy
+from dpctl.tensor._numpy_helper import normalize_axis_index

import dpnp
@@ -51,7 +52,6 @@
)
from .dpnp_array import dpnp_array
from .dpnp_utils import (
-    call_origin,
map_dtype_to_device,
)

@@ -147,7 +147,7 @@ def argsort(

Limitations
-----------
-    Parameters `order` is only supported with its default value.
+    Parameter `order` is only supported with its default value.
Otherwise ``NotImplementedError`` exception will be raised.
Sorting algorithms ``"quicksort"`` and ``"heapsort"`` are not supported.

@@ -201,44 +201,128 @@ def argsort(
)


-def partition(x1, kth, axis=-1, kind="introselect", order=None):
+def partition(a, kth, axis=-1, kind="introselect", order=None):
"""
Return a partitioned copy of an array.

For full documentation refer to :obj:`numpy.partition`.

Parameters
----------
a : {dpnp.ndarray, usm_ndarray}
Array to be sorted.
kth : {int, sequence of ints}
Element index to partition by. The k-th value of the element will be in
its final sorted position and all smaller elements will be moved before
it and all equal or greater elements behind it. The order of all
elements in the partitions is undefined. If provided with a sequence of
k-th it will partition all elements indexed by k-th of them into their
sorted position at once.
axis : {None, int}, optional
Axis along which to sort. If ``None``, the array is flattened before
sorting. The default is ``-1``, which sorts along the last axis.

Default: ``-1``.

Returns
-------
out : dpnp.ndarray
Array of the same type and shape as `a`.

Limitations
-----------
Input array is supported as :obj:`dpnp.ndarray`.
Input `kth` is supported as :obj:`int`.
Parameters `axis`, `kind` and `order` are supported only with default
values.
Parameters `kind` and `order` are only supported with its default value.
Otherwise ``NotImplementedError`` exception will be raised.

See Also
--------
:obj:`dpnp.ndarray.partition` : Equivalent method.
:obj:`dpnp.argpartition` : Indirect partition.
:obj:`dpnp.sort` : Full sorting.

Examples
--------
>>> import dpnp as np
>>> a = np.array([7, 1, 7, 7, 1, 5, 7, 2, 3, 2, 6, 2, 3, 0])
>>> p = np.partition(a, 4)
>>> p
array([0, 1, 1, 2, 2, 2, 3, 3, 5, 7, 7, 7, 7, 6]) # may vary

``p[4]`` is 2; all elements in ``p[:4]`` are less than or equal to
``p[4]``, and all elements in ``p[5:]`` are greater than or equal to
``p[4]``. The partition is::

[0, 1, 1, 2], [2], [2, 3, 3, 5, 7, 7, 7, 7, 6]

The next example shows the use of multiple values passed to `kth`.

>>> p2 = np.partition(a, (4, 8))
>>> p2
array([0, 1, 1, 2, 2, 2, 3, 3, 5, 6, 7, 7, 7, 7])

``p2[4]`` is 2 and ``p2[8]`` is 5. All elements in ``p2[:4]`` are less
than or equal to ``p2[4]``, all elements in ``p2[5:8]`` are greater than or
equal to ``p2[4]`` and less than or equal to ``p2[8]``, and all elements in
``p2[9:]`` are greater than or equal to ``p2[8]``. The partition is::

[0, 1, 1, 2], [2], [2, 3, 3], [5], [6, 7, 7, 7, 7]

"""

x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_nondefault_queue=False)
if x1_desc:
if dpnp.is_cuda_backend(x1_desc.get_array()): # pragma: no cover
raise NotImplementedError(
"Running on CUDA is currently not supported"
)
dpnp.check_supported_arrays_type(a)

if not isinstance(kth, int):
pass
elif x1_desc.ndim == 0:
pass
elif kth >= x1_desc.shape[x1_desc.ndim - 1] or x1_desc.ndim + kth < 0:
pass
elif axis != -1:
pass
elif kind != "introselect":
pass
elif order is not None:
pass
else:
return dpnp_partition(x1_desc, kth, axis, kind, order).get_pyobj()
if kind != "introselect":
raise NotImplementedError(
"`kind` keyword argument is only supported with its default value."
)
if order is not None:
raise NotImplementedError(
"`order` keyword argument is only supported with its default value."
)

-    return call_origin(numpy.partition, x1, kth, axis, kind, order)
+    if axis is None:
+        a = dpnp.ravel(a)
+        axis = -1
+
+    nd = a.ndim
+    axis = normalize_axis_index(axis, nd)
+    length = a.shape[axis]
+
+    if isinstance(kth, int):
Review comment (Contributor):

It looks too strict because it does not support integer-like scalars such as dpnp.int64.
In other dpnp functions this type is accepted for axes:

# axis : {None, int, tuple}, optional
In [22]: dpnp.count_nonzero(a, axis=dpnp.int64(0))
Out[22]: array([1, 1, 2, 1])
# axis : {None, int, tuple of ints}, optional
np.all(x, axis=np.int64(0))
Out[25]: array([ True, False])

Reply (Contributor Author):

But axes are validated by the special function normalize_axis_tuple from NumPy.

Reply (Contributor):

I meant the kth argument:

dpnp.partition(a, dpnp.int64(4), axis=None)
TypeError: kth must be int or sequence of ints, but got <class 'numpy.int64'>

Should we use operator.index for this, as implemented in dpnp.tri for k?

Reply (Contributor Author, @antonwolfy, Feb 16, 2026):

We could, but it's not mandatory, because we explicitly state the support of int and sequence of ints in the documentation.
We can implement a general improvement and add a helper function, like normalize_int_sequence, but I'd prefer to have that done separately as a general improvement.
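As a purely illustrative aside following this thread (not part of the PR), such a helper could rely on operator.index, which accepts any integer-like object implementing __index__, including numpy.int64; the name normalize_int_sequence below is hypothetical and does not exist in dpnp.

import operator
from collections.abc import Sequence

import numpy


def normalize_int_sequence(kth):
    # Hypothetical helper sketched from the review discussion above;
    # dpnp does not currently provide this function.
    # operator.index() converts anything implementing __index__ (plain int,
    # numpy.int64, ...) to a built-in int and rejects floats and strings.
    if isinstance(kth, Sequence) and not isinstance(kth, (str, bytes)):
        return tuple(operator.index(k) for k in kth)
    return (operator.index(kth),)


print(normalize_int_sequence(4))                     # (4,)
print(normalize_int_sequence((1, numpy.int64(3))))   # (1, 3)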

+        kth = (kth,)
+    elif not isinstance(kth, Sequence):
+        raise TypeError(
+            f"kth must be int or sequence of ints, but got {type(kth)}"
+        )
+    elif not all(isinstance(k, int) for k in kth):
+        raise TypeError("kth is a sequence, but not all elements are integers")
+
+    nkth = len(kth)
+    if nkth == 0 or a.size == 0:
+        return dpnp.copy(a)
+
+    # validate kth
+    kth = list(kth)
+    for i in range(nkth):
+        if kth[i] < 0:
+            kth[i] += length
+
+        if not 0 <= kth[i] < length:
+            raise ValueError(f"kth(={kth[i]}) out of bounds {length}")
+
+    dt = a.dtype
+    if (
+        nd > 1
+        or nkth > 1
+        or dpnp.issubdtype(dt, dpnp.unsignedinteger)
+        or dt in (dpnp.int8, dpnp.int16)
+        or dpnp.is_cuda_backend(a.get_array())
+    ):
+        # sort is a faster path in case of ndim > 1
+        return dpnp.sort(a, axis=axis)
+
+    desc = dpnp.get_dpnp_descriptor(a, copy_when_nondefault_queue=False)
+    return dpnp_partition(desc, kth[0], axis, kind, order).get_pyobj()
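A short sketch of the dispatch visible in the new function above (again assuming a working dpnp installation with a SYCL device): a 1-D array with a single integer kth goes through the dedicated partition kernel, while higher-dimensional input, multiple kth values, or unsigned/8-/16-bit integer data fall back to a full dpnp.sort along the axis, which still satisfies the partition guarantee for any kth.

import dpnp as np

a1 = np.array([7, 1, 5, 3, 9, 0, 4, 2])
# 1-D input, single int kth: handled by the dpnp_partition kernel path.
p1 = np.partition(a1, 3)
print(int(p1[3]))  # 3, the 4th-smallest value of a1

a2 = np.array([[7, 1, 5, 3], [9, 0, 4, 2]])
# ndim > 1 (likewise a tuple of kth or uint/int8/int16 dtypes): the code
# above returns dpnp.sort(a, axis=axis), so each row comes back fully
# sorted, which is a valid partition for kth=1 as well.
print(np.partition(a2, 1, axis=1))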


def sort(a, axis=-1, kind=None, order=None, *, descending=False, stable=None):