@@ -118,9 +118,9 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
118118@cython.wraparound(False)
119119@cython.boundscheck(False)
120120{{if dtype == 'object'}}
121- cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'):
121+ cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first', const uint8_t[:] mask=None ):
122122{{else}}
123- cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
123+ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', const uint8_t[:] mask=None ):
124124{{endif}}
125125 cdef:
126126 int ret = 0
@@ -129,10 +129,12 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
129129 {{else}}
130130 PyObject* value
131131 {{endif}}
132- Py_ssize_t i, n = len(values)
132+ Py_ssize_t i, n = len(values), first_na = -1
133133 khiter_t k
134134 kh_{{ttype}}_t *table = kh_init_{{ttype}}()
135135 ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
136+ bint seen_na = False, uses_mask = mask is not None
137+ bint seen_multiple_na = False
136138
137139 kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
138140
@@ -147,9 +149,16 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
147149 {{endif}}
148150 for i in range(n - 1, -1, -1):
149151 # equivalent: range(n)[::-1], which cython doesn't like in nogil
150- value = {{to_c_type}}(values[i])
151- kh_put_{{ttype}}(table, value, &ret)
152- out[i] = ret == 0
152+ if uses_mask and mask[i]:
153+ if seen_na:
154+ out[i] = True
155+ else:
156+ out[i] = False
157+ seen_na = True
158+ else:
159+ value = {{to_c_type}}(values[i])
160+ kh_put_{{ttype}}(table, value, &ret)
161+ out[i] = ret == 0
153162
154163 elif keep == 'first':
155164 {{if dtype == 'object'}}
@@ -158,9 +167,16 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
158167 with nogil:
159168 {{endif}}
160169 for i in range(n):
161- value = {{to_c_type}}(values[i])
162- kh_put_{{ttype}}(table, value, &ret)
163- out[i] = ret == 0
170+ if uses_mask and mask[i]:
171+ if seen_na:
172+ out[i] = True
173+ else:
174+ out[i] = False
175+ seen_na = True
176+ else:
177+ value = {{to_c_type}}(values[i])
178+ kh_put_{{ttype}}(table, value, &ret)
179+ out[i] = ret == 0
164180
165181 else:
166182 {{if dtype == 'object'}}
@@ -169,15 +185,28 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
169185 with nogil:
170186 {{endif}}
171187 for i in range(n):
172- value = {{to_c_type}}(values[i])
173- k = kh_get_{{ttype}}(table, value)
174- if k != table.n_buckets:
175- out[table.vals[k]] = 1
176- out[i] = 1
188+ if uses_mask and mask[i]:
189+ if not seen_na:
190+ first_na = i
191+ seen_na = True
192+ out[i] = 0
193+ elif not seen_multiple_na:
194+ out[i] = 1
195+ out[first_na] = 1
196+ seen_multiple_na = True
197+ else:
198+ out[i] = 1
199+
177200 else:
178- k = kh_put_{{ttype}}(table, value, &ret)
179- table.vals[k] = i
180- out[i] = 0
201+ value = {{to_c_type}}(values[i])
202+ k = kh_get_{{ttype}}(table, value)
203+ if k != table.n_buckets:
204+ out[table.vals[k]] = 1
205+ out[i] = 1
206+ else:
207+ k = kh_put_{{ttype}}(table, value, &ret)
208+ table.vals[k] = i
209+ out[i] = 0
181210
182211 kh_destroy_{{ttype}}(table)
183212 return out
@@ -301,37 +330,37 @@ cpdef value_count(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=N
301330 raise TypeError(values.dtype)
302331
303332
304- cpdef duplicated(ndarray[htfunc_t] values, object keep="first"):
333+ cpdef duplicated(ndarray[htfunc_t] values, object keep="first", const uint8_t[:] mask=None ):
305334 if htfunc_t is object:
306- return duplicated_object(values, keep)
335+ return duplicated_object(values, keep, mask=mask )
307336
308337 elif htfunc_t is int8_t:
309- return duplicated_int8(values, keep)
338+ return duplicated_int8(values, keep, mask=mask )
310339 elif htfunc_t is int16_t:
311- return duplicated_int16(values, keep)
340+ return duplicated_int16(values, keep, mask=mask )
312341 elif htfunc_t is int32_t:
313- return duplicated_int32(values, keep)
342+ return duplicated_int32(values, keep, mask=mask )
314343 elif htfunc_t is int64_t:
315- return duplicated_int64(values, keep)
344+ return duplicated_int64(values, keep, mask=mask )
316345
317346 elif htfunc_t is uint8_t:
318- return duplicated_uint8(values, keep)
347+ return duplicated_uint8(values, keep, mask=mask )
319348 elif htfunc_t is uint16_t:
320- return duplicated_uint16(values, keep)
349+ return duplicated_uint16(values, keep, mask=mask )
321350 elif htfunc_t is uint32_t:
322- return duplicated_uint32(values, keep)
351+ return duplicated_uint32(values, keep, mask=mask )
323352 elif htfunc_t is uint64_t:
324- return duplicated_uint64(values, keep)
353+ return duplicated_uint64(values, keep, mask=mask )
325354
326355 elif htfunc_t is float64_t:
327- return duplicated_float64(values, keep)
356+ return duplicated_float64(values, keep, mask=mask )
328357 elif htfunc_t is float32_t:
329- return duplicated_float32(values, keep)
358+ return duplicated_float32(values, keep, mask=mask )
330359
331360 elif htfunc_t is complex128_t:
332- return duplicated_complex128(values, keep)
361+ return duplicated_complex128(values, keep, mask=mask )
333362 elif htfunc_t is complex64_t:
334- return duplicated_complex64(values, keep)
363+ return duplicated_complex64(values, keep, mask=mask )
335364
336365 else:
337366 raise TypeError(values.dtype)
0 commit comments