1717 zip , string_types , map , u )
1818from pandas .types .common import (is_integer , _ensure_object ,
1919 is_list_like , is_integer_dtype ,
20- is_float ,
21- is_scalar )
20+ is_float , is_dtype_equal ,
21+ is_object_dtype ,
22+ is_scalar , is_categorical_dtype )
23+ from pandas .types .missing import isnull
24+ from pandas .types .cast import _astype_nansafe
2225from pandas .core .index import Index , MultiIndex , RangeIndex
2326from pandas .core .series import Series
2427from pandas .core .frame import DataFrame
28+ from pandas .core .categorical import Categorical
2529from pandas .core .common import AbstractMethodError
2630from pandas .core .config import get_option
2731from pandas .io .date_converters import generic_parser
111115 are duplicate names in the columns.
112116dtype : Type name or dict of column -> type, default None
113117 Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
114- (Unsupported with engine='python'). Use `str` or `object` to preserve and
115- not interpret dtype.
118+ Use `str` or `object` to preserve and not interpret dtype.
119+ If converters are specified, they will be applied INSTEAD
120+ of dtype conversion.
116121%s
117122converters : dict, default None
118123 Dict of functions for converting values in certain columns. Keys can either
@@ -421,6 +426,7 @@ def _read(filepath_or_buffer, kwds):
421426 'true_values' : None ,
422427 'false_values' : None ,
423428 'converters' : None ,
429+ 'dtype' : None ,
424430 'skipfooter' : 0 ,
425431
426432 'keep_default_na' : True ,
@@ -461,7 +467,6 @@ def _read(filepath_or_buffer, kwds):
461467 'buffer_lines' : None ,
462468 'error_bad_lines' : True ,
463469 'warn_bad_lines' : True ,
464- 'dtype' : None ,
465470 'float_precision' : None
466471}
467472
@@ -476,7 +481,6 @@ def _read(filepath_or_buffer, kwds):
476481 'buffer_lines' ,
477482 'error_bad_lines' ,
478483 'warn_bad_lines' ,
479- 'dtype' ,
480484 'float_precision' ,
481485])
482486_deprecated_args = set ([
@@ -834,9 +838,6 @@ def _clean_options(self, options, engine):
834838 " ignored as it is not supported by the 'python'"
835839 " engine." ).format (reason = fallback_reason ,
836840 option = arg )
837- if arg == 'dtype' :
838- msg += " (Note the 'converters' option provides" \
839- " similar functionality.)"
840841 raise ValueError (msg )
841842 del result [arg ]
842843
@@ -1285,36 +1286,59 @@ def _agg_index(self, index, try_parse_dates=True):
12851286 col_na_values , col_na_fvalues = _get_na_values (
12861287 col_name , self .na_values , self .na_fvalues )
12871288
1288- arr , _ = self ._convert_types (arr , col_na_values | col_na_fvalues )
1289+ arr , _ = self ._infer_types (arr , col_na_values | col_na_fvalues )
12891290 arrays .append (arr )
12901291
12911292 index = MultiIndex .from_arrays (arrays , names = self .index_names )
12921293
12931294 return index
12941295
12951296 def _convert_to_ndarrays (self , dct , na_values , na_fvalues , verbose = False ,
1296- converters = None ):
1297+ converters = None , dtypes = None ):
12971298 result = {}
12981299 for c , values in compat .iteritems (dct ):
12991300 conv_f = None if converters is None else converters .get (c , None )
1301+ if isinstance (dtypes , dict ):
1302+ cast_type = dtypes .get (c , None )
1303+ else :
1304+ # single dtype or None
1305+ cast_type = dtypes
13001306
13011307 if self .na_filter :
13021308 col_na_values , col_na_fvalues = _get_na_values (
13031309 c , na_values , na_fvalues )
13041310 else :
13051311 col_na_values , col_na_fvalues = set (), set ()
13061312
1307- coerce_type = True
13081313 if conv_f is not None :
1314+ # conv_f applied to data before inference
1315+ if cast_type is not None :
1316+ warnings .warn (("Both a converter and dtype were specified "
1317+ "for column {0} - only the converter will "
1318+ "be used" ).format (c ), ParserWarning ,
1319+ stacklevel = 7 )
1320+
13091321 try :
13101322 values = lib .map_infer (values , conv_f )
13111323 except ValueError :
13121324 mask = lib .ismember (values , na_values ).view (np .uint8 )
13131325 values = lib .map_infer_mask (values , conv_f , mask )
1314- coerce_type = False
13151326
1316- cvals , na_count = self ._convert_types (
1317- values , set (col_na_values ) | col_na_fvalues , coerce_type )
1327+ cvals , na_count = self ._infer_types (
1328+ values , set (col_na_values ) | col_na_fvalues ,
1329+ try_num_bool = False )
1330+ else :
1331+ # skip inference if specified dtype is object
1332+ try_num_bool = not (cast_type and is_object_dtype (cast_type ))
1333+
1334+ # general type inference and conversion
1335+ cvals , na_count = self ._infer_types (
1336+ values , set (col_na_values ) | col_na_fvalues ,
1337+ try_num_bool )
1338+
1339+ # type specified in dtype param
1340+ if cast_type and not is_dtype_equal (cvals , cast_type ):
1341+ cvals = self ._cast_types (cvals , cast_type , c )
13181342
13191343 if issubclass (cvals .dtype .type , np .integer ) and self .compact_ints :
13201344 cvals = lib .downcast_int64 (
@@ -1326,7 +1350,23 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
13261350 print ('Filled %d NA values in column %s' % (na_count , str (c )))
13271351 return result
13281352
1329- def _convert_types (self , values , na_values , try_num_bool = True ):
1353+ def _infer_types (self , values , na_values , try_num_bool = True ):
1354+ """
1355+ Infer types of values, possibly casting
1356+
1357+ Parameters
1358+ ----------
1359+ values : ndarray
1360+ na_values : set
1361+ try_num_bool : bool, default True
1362+ try to cast values to numeric (first preference) or boolean
1363+
1364+ Returns
1365+ -------
1366+ converted : ndarray
1367+ na_count : int
1368+ """
1369+
13301370 na_count = 0
13311371 if issubclass (values .dtype .type , (np .number , np .bool_ )):
13321372 mask = lib .ismember (values , na_values )
@@ -1340,6 +1380,7 @@ def _convert_types(self, values, na_values, try_num_bool=True):
13401380 if try_num_bool :
13411381 try :
13421382 result = lib .maybe_convert_numeric (values , na_values , False )
1383+ na_count = isnull (result ).sum ()
13431384 except Exception :
13441385 result = values
13451386 if values .dtype == np .object_ :
@@ -1356,6 +1397,38 @@ def _convert_types(self, values, na_values, try_num_bool=True):
13561397
13571398 return result , na_count
13581399
def _cast_types(self, values, cast_type, column):
    """
    Convert already-parsed values to the dtype requested for a column.

    Parameters
    ----------
    values : ndarray
        Column data to convert.
    cast_type : string or np.dtype
        Target dtype.
    column : string
        Name of the column; only interpolated into the error message.

    Returns
    -------
    converted : ndarray or Categorical

    Raises
    ------
    ValueError
        If the non-categorical conversion raises ValueError; re-raised
        with a message naming the column and the target type.
    """
    if not is_categorical_dtype(cast_type):
        try:
            return _astype_nansafe(values, cast_type, copy=True)
        except ValueError:
            raise ValueError("Unable to convert column %s to "
                             "type %s" % (column, cast_type))

    # XXX kept consistent with the c-parser, which parses all
    # categories as strings: stringify anything that is not already
    # object dtype before building the Categorical
    if not is_object_dtype(values):
        values = _astype_nansafe(values, str)
    return Categorical(values)
1431+
13591432 def _do_date_conversions (self , names , data ):
13601433 # returns data, columns
13611434 if self .parse_dates is not None :
@@ -1784,6 +1857,7 @@ def __init__(self, f, **kwds):
17841857
17851858 self .verbose = kwds ['verbose' ]
17861859 self .converters = kwds ['converters' ]
1860+ self .dtype = kwds ['dtype' ]
17871861
17881862 self .compact_ints = kwds ['compact_ints' ]
17891863 self .use_unsigned = kwds ['use_unsigned' ]
@@ -1982,7 +2056,7 @@ def read(self, rows=None):
19822056 # DataFrame with the right metadata, even though it's length 0
19832057 names = self ._maybe_dedup_names (self .orig_names )
19842058 index , columns , col_dict = _get_empty_meta (
1985- names , self .index_col , self .index_names )
2059+ names , self .index_col , self .index_names , self . dtype )
19862060 columns = self ._maybe_make_multi_index_columns (
19872061 columns , self .col_names )
19882062 return index , columns , col_dict
@@ -2033,15 +2107,25 @@ def get_chunk(self, size=None):
20332107
def _convert_data(self, data):
    """
    Apply the configured converters and dtypes to the parsed columns.

    Integer keys in ``self.converters`` (and in ``self.dtype`` when it is
    a dict) that are not themselves column names are treated as positions
    and replaced by the matching entry of ``self.orig_names``.  The
    result is handed to ``self._convert_to_ndarrays``.
    """
    def _positions_to_names(mapping):
        "converts col numbers to names"
        renamed = {}
        for key, value in compat.iteritems(mapping):
            is_position = (isinstance(key, int) and
                           key not in self.orig_names)
            if is_position:
                key = self.orig_names[key]
            renamed[key] = value
        return renamed

    converters_by_name = _positions_to_names(self.converters)

    if isinstance(self.dtype, dict):
        dtypes_by_name = _positions_to_names(self.dtype)
    else:
        # a single dtype (or None) applies to every column as-is
        dtypes_by_name = self.dtype

    return self._convert_to_ndarrays(data, self.na_values,
                                     self.na_fvalues, self.verbose,
                                     converters_by_name, dtypes_by_name)
20452129
20462130 def _to_recarray (self , data , columns ):
20472131 dtypes = []
0 commit comments