# ties - python min index

## Is there a way to make numpy.argmin() as fast as min()? (2)

I just took a look at the source code, and while I don't fully understand why things are being done the way they are, this is what happens:

1. `np.min` is basically a call to `np.minimum.reduce`.

2. `np.argmin` first moves the axis you want to operate on to the end of the shape tuple, then makes it a contiguous array, which of course triggers a copy of the full array unless the axis was the last one to begin with.

Since a copy is being made, you can get creative and try to instantiate cheaper arrays:

``````a = np.random.rand(1000, 2000)

def fast_argmin_axis_0(a):
matches = np.nonzero((a == np.min(a, axis=0)).ravel())
rows, cols = np.unravel_index(matches, a.shape)
argmin_array = np.empty(a.shape, dtype=np.intp)
argmin_array[cols] = rows
return argmin_array

In : np.argmin(a, axis=0)
Out: array([230, 532, 815, ..., 670, 702, 989], dtype=int64)

In : fast_argmin_axis_0(a)
Out: array([230, 532, 815, ..., 670, 702, 989], dtype=int64)

In : %timeit np.argmin(a, axis=0)
10 loops, best of 3: 27.3 ms per loop

In : %timeit fast_argmin_axis_0(a)
100 loops, best of 3: 15 ms per loop
``````

I wouldn't go as far as calling the current implementation a bug, since there may be good reasons for numpy doing what it does the way it does it, but that this kind of trickery can speed up what should be a highly optimized function, strongly suggests that things could be done better.

I'm trying to find the minimum array indices along one dimension of a very large 2D numpy array. I'm finding that this is very slow (already tried speeding it up with bottleneck, which was only a minimal improvement). However, taking the straight minimum appears to be an order of magnitude faster:

``````import numpy as np
import time

randvals = np.random.rand(3000,160000)
start = time.time()
minval = randvals.min(axis=0)
print "Took {0:.2f} seconds to compute min".format(time.time()-start)
start = time.time()
minindex = np.argmin(randvals,axis=0)
print "Took {0:.2f} seconds to compute argmin".format(time.time()-start)
``````

On my machine this outputs:

``````Took 0.83 seconds to compute min
Took 9.58 seconds to compute argmin
``````

Is there any reason why argmin is so much slower? Is there any way to speed it up to comparable to min?

I was just hitting the same problem, and found the large difference in performance when axis 0 is selected for finding the minimum. As suggested, the problem seems to be related to internally copying the array.

I devised a rather simple-minded workaround using Cython that simultaneously establishes the minimum values and their position in a 2D numpy array along a given axis. Note that for `axis = 0`, the algorithm works on an array of columns (maximum number specified by the constant `blocksize` - here set equivalent to 8 kByte of data) simultaneously to make good use of the cache:

``````%%cython -c=-O2 -c=-march=native

import numpy as np
cimport numpy as np
cimport cython
from libc.stdint cimport uint32_t

@cython.boundscheck(False)
@cython.wraparound(False)
cdef void _minargmin_2d_64_axis_0(uint32_t[:] minloc, double[:] minimum, double[:, :] arr) nogil:
"""
Find the minimum and argmin for a 2D array of 64-bit floats along axis 0
Parameters:
-----------
arr: numpy_array, dtype=np.float64, shape=(m, n)
The array for which the minima should be computed.
minloc: numpy_array, dtype=np.uint32, shape=(n)
Stores the rows where the minima occur for each column.
minimum: numpy_array, dtype=np.float64, shape=(n)
The minima along each column.
"""

# Columns of the matrix are accessed in blocks to increase cache performance.
# Specify the number of columns here:

DEF blocksize = 1024

cdef int i, j, k
cdef double minim[blocksize]
cdef uint32_t minimloc[blocksize]

# Read blocks of data to make good use of the cache

cdef int blocks
blocks  = arr.shape / blocksize
remains = arr.shape % blocksize

for i in xrange(0, blocksize * blocks, blocksize):

for k in xrange(blocksize):
minim[k]    = arr[0, i + k]
minimloc[k] = 0

for j in xrange(1, arr.shape):

for k in xrange(blocksize):

if arr[j, i + k] < minim[k]:
minim[k] = arr[j, i + k]
minimloc[k] = j

for k in xrange(blocksize):
minimum[i + k] = minim[k]
minloc[i + k]  = minimloc[k]

# Work on the final 'incomplete' block

i = blocksize * blocks

for k in xrange(remains):
minim[k]    = arr[0, i + k]
minimloc[k] = 0

for j in xrange(1, arr.shape):

for k in xrange(remains):

if arr[j, i + k] < minim[k]:
minim[k] = arr[j, i + k]
minimloc[k] = j

for k in xrange(remains):
minimum[i + k] = minim[k]
minloc[i + k]  = minimloc[k]

# Done!

@cython.boundscheck(False)
@cython.wraparound(False)
cdef void _minargmin_2d_64_axis_1(uint32_t[:] minloc, double[:] minimum, double[:, :] arr) nogil:
"""
Find the minimum and argmin for a 2D array of 64-bit floats along axis 1
Parameters:
-----------
arr: numpy_array, dtype=np.float64, shape=(m, n)
The array for which the minima should be computed.
minloc: numpy_array, dtype=np.uint32, shape=(m)
Stores the rows where the minima occur for each row.
minimum: numpy_array, dtype=np.float64, shape=(m)
The minima along each row.
"""
cdef int i
cdef int j
cdef double minim
cdef uint32_t minimloc

for i in xrange(arr.shape):
minim = arr[i, 0]
minimloc = 0

for j in xrange(1, arr.shape):
if arr[i, j] < minim:
minim = arr[i, j]
minimloc = j

minimum[i] = minim
minloc[i]  = minimloc

@cython.boundscheck(False)
@cython.wraparound(False)
cdef void _minargmin_2d_32_axis_0(uint32_t[:] minloc, float[:] minimum, float[:, :] arr) nogil:
"""
Find the minimum and argmin for a 2D array of 32-bit floats along axis 0
Parameters:
-----------
arr: numpy_array, dtype=np.float32, shape=(m, n)
The array for which the minima should be computed.
minloc: numpy_array, dtype=np.uint32, shape=(n)
Stores the rows where the minima occur for each column.
minimum: numpy_array, dtype=np.float32, shape=(n)
The minima along each column.
"""

# Columns of the matrix are accessed in blocks to increase cache performance.
# Specify the number of columns here:

DEF blocksize = 2048

cdef int i, j, k
cdef float minim[blocksize]
cdef uint32_t minimloc[blocksize]

# Read blocks of data to make good use of the cache

cdef int blocks
blocks  = arr.shape / blocksize
remains = arr.shape % blocksize

for i in xrange(0, blocksize * blocks, blocksize):

for k in xrange(blocksize):
minim[k]    = arr[0, i + k]
minimloc[k] = 0

for j in xrange(1, arr.shape):

for k in xrange(blocksize):

if arr[j, i + k] < minim[k]:
minim[k] = arr[j, i + k]
minimloc[k] = j

for k in xrange(blocksize):
minimum[i + k] = minim[k]
minloc[i + k]  = minimloc[k]

# Work on the final 'incomplete' block

i = blocksize * blocks

for k in xrange(remains):
minim[k]    = arr[0, i + k]
minimloc[k] = 0

for j in xrange(1, arr.shape):

for k in xrange(remains):

if arr[j, i + k] < minim[k]:
minim[k] = arr[j, i + k]
minimloc[k] = j

for k in xrange(remains):
minimum[i + k] = minim[k]
minloc[i + k]  = minimloc[k]

# Done!

@cython.boundscheck(False)
@cython.wraparound(False)
cdef void _minargmin_2d_32_axis_1(uint32_t[:] minloc, float[:] minimum, float[:, :] arr) nogil:
"""
Find the minimum and argmin for a 2D array of 32-bit floats along axis 1
Parameters:
-----------
arr: numpy_array, dtype=np.float32, shape=(m, n)
The array for which the minima should be computed.
minloc: numpy_array, dtype=np.uint32, shape=(m)
Stores the rows where the minima occur for each row.
minimum: numpy_array, dtype=np.float32, shape=(m)
The minima along each row.
"""
cdef int i
cdef int j
cdef float minim
cdef uint32_t minimloc

for i in xrange(arr.shape):
minim = arr[i, 0]
minimloc = 0

for j in xrange(1, arr.shape):
if arr[i, j] < minim:
minim = arr[i, j]
minimloc = j

minimum[i] = minim
minloc[i]  = minimloc

def Min_Argmin(array_2d, axis = 1):
"""
Find the minima and corresponding locations (argmin) of a two-dimensional
numpy array of floats along a given axis simultaneously, and returns them
as a tuple of arrays (min_2d, argmin_2d).

(Note: float16 arrays will be internally transformed to float32 arrays.)
----------
array_2d: numpy_array, dtype=np.float32 or np.float64, shape=(m, n)
The array for which the minima should be computed.
axis : int, 0 or 1 (default) :
The axis along which minima are computed.
min_2d: numpy_array, dtype=np.uint8, shape=(m) or shape=(n):
The array where the minima along the given axis are stored.
argmin_2d:
The array storing the row/column where the minimum occurs.
"""

# Sanity checks:
if len(array_2d.shape) != 2:
raise IndexError('Min_Argmin: Number of dimensions of array must be 2')

if not (axis == 0 or axis == 1):
raise ValueError('Min_Argmin: Invalid axis specified')

arr_type = array_2d.dtype

if not arr_type in ('float16', 'float32', 'float64'):
raise ValueError('Min_Argmin: Not a float array')

# Transform float16 arrays
if arr_type == 'float16':
array_2d = array_2d.astype(np.float32)

# Run analysis

if arr_type == 'float64':

# Double accuracy

min_array    = np.zeros(array_2d.shape[1 - axis], dtype = np.float64)
argmin_array = np.zeros(array_2d.shape[1 - axis], dtype = np.uint32)

if (axis == 0):
_minargmin_2d_64_axis_0(argmin_array, min_array, array_2d)

else:
_minargmin_2d_64_axis_1(argmin_array, min_array, array_2d)

else:

# Single accuracy

min_array    = np.zeros(array_2d.shape[1 - axis], dtype = np.float32)
argmin_array = np.zeros(array_2d.shape[1 - axis], dtype = np.uint32)

if (axis == 0):
_minargmin_2d_32_axis_0(argmin_array, min_array, array_2d)

else:
_minargmin_2d_32_axis_1(argmin_array, min_array, array_2d)

# Transform back to float16 type as necessary

if arr_type == 'float16':
min_array = min_array.astype(np.float16)

# Return results
return min_array, argmin_array
``````

The code can be placed and compiled in an IPython notebook cell after loading Cython support:

``````%load_ext Cython
``````

and then called in the form:

``````min_array, argmin_array = Min_Argmin(two_dim_array, axis = 0 or 1)
``````

Timing example:

``````random_array = np.random.rand(20000, 20000).astype(np.float32)

%timeit min_array, argmin_array = Min_Argmin(random_array, axis = 0)
%timeit min_array, argmin_array = Min_Argmin(random_array, axis = 1)

1 loops, best of 3: 405 ms per loop
1 loops, best of 3: 307 ms per loop
``````

For comparison:

``````%%timeit
min_array    = random_array.min(axis = 0)
argmin_array = random_array.argmin(axis = 0)

1 loops, best of 3: 10.3 s per loop

%%timeit
min_array    = random_array.min(axis = 1)
argmin_array = random_array.argmin(axis = 1)

1 loops, best of 3: 423 ms per loop
``````

So, there is a significant speedup for `axis = 0` (and still a small advantage for `axis = 1`, if one is interested in both minimum and location). 