Modified

October 19, 2025

import polars as pl
import polars.selectors as cs
print(pl.__version__)

1.6.0

One of the main problems with messy data is: how do you know if it’s messy or not?

We’re going to use the NYC 311 service request dataset again here, since it’s big and a bit unwieldy. If we try to read it in polars, we immediately run into an error. Polars cannot infer the data type of the data in the csv file:

requests = pl.read_csv('../data/311-service-requests.csv')

---------------------------------------------------------------------------

ComputeError                              Traceback (most recent call last)

Cell In[2], line 1
----> 1 requests = pl.read_csv('../data/311-service-requests.csv')


File ~/miniconda3/envs/pcb/lib/python3.12/site-packages/polars/_utils/deprecation.py:91, in deprecate_renamed_parameter.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
     86 @wraps(function)
     87 def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
     88     _rename_keyword_argument(
     89         old_name, new_name, kwargs, function.__qualname__, version
     90     )
---> 91     return function(*args, **kwargs)


File ~/miniconda3/envs/pcb/lib/python3.12/site-packages/polars/_utils/deprecation.py:91, in deprecate_renamed_parameter.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
     86 @wraps(function)
     87 def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
     88     _rename_keyword_argument(
     89         old_name, new_name, kwargs, function.__qualname__, version
     90     )
---> 91     return function(*args, **kwargs)


File ~/miniconda3/envs/pcb/lib/python3.12/site-packages/polars/_utils/deprecation.py:91, in deprecate_renamed_parameter.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
     86 @wraps(function)
     87 def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
     88     _rename_keyword_argument(
     89         old_name, new_name, kwargs, function.__qualname__, version
     90     )
---> 91     return function(*args, **kwargs)


File ~/miniconda3/envs/pcb/lib/python3.12/site-packages/polars/io/csv/functions.py:496, in read_csv(source, has_header, columns, new_columns, separator, comment_prefix, quote_char, skip_rows, schema, schema_overrides, null_values, missing_utf8_is_empty_string, ignore_errors, try_parse_dates, n_threads, infer_schema, infer_schema_length, batch_size, n_rows, encoding, low_memory, rechunk, use_pyarrow, storage_options, skip_rows_after_header, row_index_name, row_index_offset, sample_size, eol_char, raise_if_empty, truncate_ragged_lines, decimal_comma, glob)
    488 else:
    489     with prepare_file_arg(
    490         source,
    491         encoding=encoding,
   (...)
    494         storage_options=storage_options,
    495     ) as data:
--> 496         df = _read_csv_impl(
    497             data,
    498             has_header=has_header,
    499             columns=columns if columns else projection,
    500             separator=separator,
    501             comment_prefix=comment_prefix,
    502             quote_char=quote_char,
    503             skip_rows=skip_rows,
    504             schema_overrides=schema_overrides,
    505             schema=schema,
    506             null_values=null_values,
    507             missing_utf8_is_empty_string=missing_utf8_is_empty_string,
    508             ignore_errors=ignore_errors,
    509             try_parse_dates=try_parse_dates,
    510             n_threads=n_threads,
    511             infer_schema_length=infer_schema_length,
    512             batch_size=batch_size,
    513             n_rows=n_rows,
    514             encoding=encoding if encoding == "utf8-lossy" else "utf8",
    515             low_memory=low_memory,
    516             rechunk=rechunk,
    517             skip_rows_after_header=skip_rows_after_header,
    518             row_index_name=row_index_name,
    519             row_index_offset=row_index_offset,
    520             sample_size=sample_size,
    521             eol_char=eol_char,
    522             raise_if_empty=raise_if_empty,
    523             truncate_ragged_lines=truncate_ragged_lines,
    524             decimal_comma=decimal_comma,
    525             glob=glob,
    526         )
    528 if new_columns:
    529     return _update_columns(df, new_columns)


File ~/miniconda3/envs/pcb/lib/python3.12/site-packages/polars/io/csv/functions.py:642, in _read_csv_impl(source, has_header, columns, separator, comment_prefix, quote_char, skip_rows, schema, schema_overrides, null_values, missing_utf8_is_empty_string, ignore_errors, try_parse_dates, n_threads, infer_schema_length, batch_size, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_index_name, row_index_offset, sample_size, eol_char, raise_if_empty, truncate_ragged_lines, decimal_comma, glob)
    638         raise ValueError(msg)
    640 projection, columns = parse_columns_arg(columns)
--> 642 pydf = PyDataFrame.read_csv(
    643     source,
    644     infer_schema_length,
    645     batch_size,
    646     has_header,
    647     ignore_errors,
    648     n_rows,
    649     skip_rows,
    650     projection,
    651     separator,
    652     rechunk,
    653     columns,
    654     encoding,
    655     n_threads,
    656     path,
    657     dtype_list,
    658     dtype_slice,
    659     low_memory,
    660     comment_prefix,
    661     quote_char,
    662     processed_null_values,
    663     missing_utf8_is_empty_string,
    664     try_parse_dates,
    665     skip_rows_after_header,
    666     parse_row_index_args(row_index_name, row_index_offset),
    667     sample_size=sample_size,
    668     eol_char=eol_char,
    669     raise_if_empty=raise_if_empty,
    670     truncate_ragged_lines=truncate_ragged_lines,
    671     decimal_comma=decimal_comma,
    672     schema=schema,
    673 )
    674 return wrap_df(pydf)


ComputeError: could not parse `11549-3650` as dtype `i64` at column 'Incident Zip' (column number 9)

The current offset in the file is 34985879 bytes.

You might want to try:
- increasing `infer_schema_length` (e.g. `infer_schema_length=10000`),
- specifying correct dtype with the `dtypes` argument
- setting `ignore_errors` to `True`,
- adding `11549-3650` to the `null_values` list.

Original error: ```remaining bytes non-empty```

We can force polars to try harder to infer the data type by setting infer_schema_length to None. Looking at the schema below, I can see that Incident Zip was parsed as a string. That doesn’t look right.

requests = pl.read_csv('../data/311-service-requests.csv', infer_schema_length=None)
display(requests.head())
display(requests.schema)

shape: (5, 52)

Unique Key	Created Date	Closed Date	Agency	Agency Name	Complaint Type	Descriptor	Location Type	Incident Zip	Incident Address	Street Name	Cross Street 1	Cross Street 2	Intersection Street 1	Intersection Street 2	Address Type	City	Landmark	Facility Type	Status	Due Date	Resolution Action Updated Date	Community Board	Borough	X Coordinate (State Plane)	Y Coordinate (State Plane)	Park Facility Name	Park Borough	School Name	School Number	School Region	School Code	School Phone Number	School Address	School City	School State	School Zip	School Not Found	School or Citywide Complaint	Vehicle Type	Taxi Company Borough	Taxi Pick Up Location	Bridge Highway Name	Bridge Highway Direction	Road Ramp	Bridge Highway Segment	Garage Lot Name	Ferry Direction	Ferry Terminal Name	Latitude	Longitude	Location
i64	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	i64	i64	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	f64	f64	str
26589651	"10/31/2013 02:08:41 AM"	null	"NYPD"	"New York City Police Departmen…	"Noise - Street/Sidewalk"	"Loud Talking"	"Street/Sidewalk"	"11432"	"90-03 169 STREET"	"169 STREET"	"90 AVENUE"	"91 AVENUE"	null	null	"ADDRESS"	"JAMAICA"	null	"Precinct"	"Assigned"	"10/31/2013 10:08:41 AM"	"10/31/2013 02:35:17 AM"	"12 QUEENS"	"QUEENS"	1042027	197389	"Unspecified"	"QUEENS"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"N"	null	null	null	null	null	null	null	null	null	null	null	40.708275	-73.791604	"(40.70827532593202, -73.791603…
26593698	"10/31/2013 02:01:04 AM"	null	"NYPD"	"New York City Police Departmen…	"Illegal Parking"	"Commercial Overnight Parking"	"Street/Sidewalk"	"11378"	"58 AVENUE"	"58 AVENUE"	"58 PLACE"	"59 STREET"	null	null	"BLOCKFACE"	"MASPETH"	null	"Precinct"	"Open"	"10/31/2013 10:01:04 AM"	null	"05 QUEENS"	"QUEENS"	1009349	201984	"Unspecified"	"QUEENS"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"N"	null	null	null	null	null	null	null	null	null	null	null	40.721041	-73.909453	"(40.721040535628305, -73.90945…
26594139	"10/31/2013 02:00:24 AM"	"10/31/2013 02:40:32 AM"	"NYPD"	"New York City Police Departmen…	"Noise - Commercial"	"Loud Music/Party"	"Club/Bar/Restaurant"	"10032"	"4060 BROADWAY"	"BROADWAY"	"WEST 171 STREET"	"WEST 172 STREET"	null	null	"ADDRESS"	"NEW YORK"	null	"Precinct"	"Closed"	"10/31/2013 10:00:24 AM"	"10/31/2013 02:39:42 AM"	"12 MANHATTAN"	"MANHATTAN"	1001088	246531	"Unspecified"	"MANHATTAN"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"N"	null	null	null	null	null	null	null	null	null	null	null	40.84333	-73.939144	"(40.84332975466513, -73.939143…
26595721	"10/31/2013 01:56:23 AM"	"10/31/2013 02:21:48 AM"	"NYPD"	"New York City Police Departmen…	"Noise - Vehicle"	"Car/Truck Horn"	"Street/Sidewalk"	"10023"	"WEST 72 STREET"	"WEST 72 STREET"	"COLUMBUS AVENUE"	"AMSTERDAM AVENUE"	null	null	"BLOCKFACE"	"NEW YORK"	null	"Precinct"	"Closed"	"10/31/2013 09:56:23 AM"	"10/31/2013 02:21:10 AM"	"07 MANHATTAN"	"MANHATTAN"	989730	222727	"Unspecified"	"MANHATTAN"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"N"	null	null	null	null	null	null	null	null	null	null	null	40.778009	-73.980213	"(40.7780087446372, -73.9802134…
26590930	"10/31/2013 01:53:44 AM"	null	"DOHMH"	"Department of Health and Menta…	"Rodent"	"Condition Attracting Rodents"	"Vacant Lot"	"10027"	"WEST 124 STREET"	"WEST 124 STREET"	"LENOX AVENUE"	"ADAM CLAYTON POWELL JR BOULEVA…	null	null	"BLOCKFACE"	"NEW YORK"	null	"N/A"	"Pending"	"11/30/2013 01:53:44 AM"	"10/31/2013 01:59:54 AM"	"10 MANHATTAN"	"MANHATTAN"	998815	233545	"Unspecified"	"MANHATTAN"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"N"	null	null	null	null	null	null	null	null	null	null	null	40.807691	-73.947387	"(40.80769092704951, -73.947387…

Schema([('Unique Key', Int64),
        ('Created Date', String),
        ('Closed Date', String),
        ('Agency', String),
        ('Agency Name', String),
        ('Complaint Type', String),
        ('Descriptor', String),
        ('Location Type', String),
        ('Incident Zip', String),
        ('Incident Address', String),
        ('Street Name', String),
        ('Cross Street 1', String),
        ('Cross Street 2', String),
        ('Intersection Street 1', String),
        ('Intersection Street 2', String),
        ('Address Type', String),
        ('City', String),
        ('Landmark', String),
        ('Facility Type', String),
        ('Status', String),
        ('Due Date', String),
        ('Resolution Action Updated Date', String),
        ('Community Board', String),
        ('Borough', String),
        ('X Coordinate (State Plane)', Int64),
        ('Y Coordinate (State Plane)', Int64),
        ('Park Facility Name', String),
        ('Park Borough', String),
        ('School Name', String),
        ('School Number', String),
        ('School Region', String),
        ('School Code', String),
        ('School Phone Number', String),
        ('School Address', String),
        ('School City', String),
        ('School State', String),
        ('School Zip', String),
        ('School Not Found', String),
        ('School or Citywide Complaint', String),
        ('Vehicle Type', String),
        ('Taxi Company Borough', String),
        ('Taxi Pick Up Location', String),
        ('Bridge Highway Name', String),
        ('Bridge Highway Direction', String),
        ('Road Ramp', String),
        ('Bridge Highway Segment', String),
        ('Garage Lot Name', String),
        ('Ferry Direction', String),
        ('Ferry Terminal Name', String),
        ('Latitude', Float64),
        ('Longitude', Float64),
        ('Location', String)])

7.1 How do we know if it’s messy?

We’re going to look at a few columns here. I know already that there are some problems with the zip code, so let’s look at that first.

To get a sense for whether a column has problems, I usually use .unique() to look at all its values. If it’s a numeric column, I’ll instead plot a histogram to get a sense of the distribution.

When we look at the unique values in “Incident Zip”, it quickly becomes clear that this is a mess.

Some of the problems:

Some have been parsed as strings, and some as floats
There are nans
Some of the zip codes are 29616-0759 or 83
There are some N/A values that polars didn’t recognize, like ‘N/A’ and ‘NO CLUE’

What we can do:

Normalize ‘N/A’ and ‘NO CLUE’ into regular nan values
Look at what’s up with the 83, and decide what to do
Make everything strings

requests['Incident Zip'].unique().sort()

shape: (251,)

Incident Zip
str
null
"00000"
"000000"
"00083"
"02061"
…
"90010"
"92123"
"N/A"
"NA"
"NO CLUE"

7.2 Fixing the null_values and string/float confusion

We can pass a null_values option to pl.read_csv to clean this up a little bit. We can also specify that the type of Incident Zip is a string, not a float.

null_values = ['NO CLUE', 'N/A', '0', 'NA']
requests = pl.read_csv('../data/311-service-requests.csv', null_values=null_values, dtypes={'Incident Zip':pl.String})
requests['Incident Zip'].unique().sort()

/var/folders/sz/c22f1dwn4pz41534xrybbydc0000gn/T/ipykernel_26170/480128266.py:2: DeprecationWarning: The argument `dtypes` for `read_csv` is deprecated. It has been renamed to `schema_overrides`.
  requests = pl.read_csv('../data/311-service-requests.csv', null_values=null_values, dtypes={'Incident Zip':pl.String})

shape: (248,)

Incident Zip
str
null
"00000"
"000000"
"00083"
"02061"
…
"70711"
"77056"
"77092-2016"
"90010"
"92123"

7.3 What’s up with the dashes?

rows_with_dashes = requests.filter(
    pl.col('Incident Zip').str.contains('-')
)
print('number of zip codes with dashes: ', rows_with_dashes.height)
rows_with_dashes.head()

number of zip codes with dashes:  5

shape: (5, 52)

Unique Key	Created Date	Closed Date	Agency	Agency Name	Complaint Type	Descriptor	Location Type	Incident Zip	Incident Address	Street Name	Cross Street 1	Cross Street 2	Intersection Street 1	Intersection Street 2	Address Type	City	Landmark	Facility Type	Status	Due Date	Resolution Action Updated Date	Community Board	Borough	X Coordinate (State Plane)	Y Coordinate (State Plane)	Park Facility Name	Park Borough	School Name	School Number	School Region	School Code	School Phone Number	School Address	School City	School State	School Zip	School Not Found	School or Citywide Complaint	Vehicle Type	Taxi Company Borough	Taxi Pick Up Location	Bridge Highway Name	Bridge Highway Direction	Road Ramp	Bridge Highway Segment	Garage Lot Name	Ferry Direction	Ferry Terminal Name	Latitude	Longitude	Location
i64	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	i64	i64	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	f64	f64	str
26550551	"10/24/2013 06:16:34 PM"	null	"DCA"	"Department of Consumer Affairs"	"Consumer Complaint"	"False Advertising"	null	"77092-2016"	"2700 EAST SELTICE WAY"	"EAST SELTICE WAY"	null	null	null	null	null	"HOUSTON"	null	null	"Assigned"	"11/13/2013 11:15:20 AM"	"10/29/2013 11:16:16 AM"	"0 Unspecified"	"Unspecified"	null	null	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"N"	null	null	null	null	null	null	null	null	null	null	null	null	null	null
26548831	"10/24/2013 09:35:10 AM"	null	"DCA"	"Department of Consumer Affairs"	"Consumer Complaint"	"Harassment"	null	"55164-0737"	"P.O. BOX 64437"	"64437"	null	null	null	null	null	"ST. PAUL"	null	null	"Assigned"	"11/13/2013 02:30:21 PM"	"10/29/2013 02:31:06 PM"	"0 Unspecified"	"Unspecified"	null	null	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"N"	null	null	null	null	null	null	null	null	null	null	null	null	null	null
26488417	"10/15/2013 03:40:33 PM"	null	"TLC"	"Taxi and Limousine Commission"	"Taxi Complaint"	"Driver Complaint"	"Street"	"11549-3650"	"365 HOFSTRA UNIVERSITY"	"HOFSTRA UNIVERSITY"	null	null	null	null	null	"HEMSTEAD"	null	null	"Assigned"	"11/30/2013 01:20:33 PM"	"10/16/2013 01:21:39 PM"	"0 Unspecified"	"Unspecified"	null	null	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"N"	null	null	null	"La Guardia Airport"	null	null	null	null	null	null	null	null	null	null
26468296	"10/10/2013 12:36:43 PM"	"10/26/2013 01:07:07 AM"	"DCA"	"Department of Consumer Affairs"	"Consumer Complaint"	"Debt Not Owed"	null	"29616-0759"	"PO BOX 25759"	"BOX 25759"	null	null	null	null	null	"GREENVILLE"	null	null	"Closed"	"10/26/2013 09:20:28 AM"	"10/26/2013 01:07:07 AM"	"0 Unspecified"	"Unspecified"	null	null	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"N"	null	null	null	null	null	null	null	null	null	null	null	null	null	null
26461137	"10/09/2013 05:23:46 PM"	"10/25/2013 01:06:41 AM"	"DCA"	"Department of Consumer Affairs"	"Consumer Complaint"	"Harassment"	null	"35209-3114"	"600 BEACON PKWY"	"BEACON PKWY"	null	null	null	null	null	"BIRMINGHAM"	null	null	"Closed"	"10/25/2013 02:43:42 PM"	"10/25/2013 01:06:41 AM"	"0 Unspecified"	"Unspecified"	null	null	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"N"	null	null	null	null	null	null	null	null	null	null	null	null	null	null

I thought these were missing data and originally deleted them. But then my friend Dave pointed out that 9-digit zip codes are normal. Let’s look at all the zip codes with more than 5 digits, make sure they’re okay, and then truncate them.

requests.filter(
    pl.col('Incident Zip').str.contains('-')
)['Incident Zip'].unique()

shape: (5,)

Incident Zip
str
"77092-2016"
"11549-3650"
"55164-0737"
"35209-3114"
"29616-0759"

Those all look okay to truncate to me.

requests = requests.with_columns(
    pl.col('Incident Zip').str.slice(0, 5)
)
requests.filter(
    pl.col('Incident Zip').str.contains('-')
)['Incident Zip'].unique()

shape: (0,)

Incident Zip
str

Done.

Earlier I thought 00083 was a broken zip code, but turns out Central Park’s zip code 00083! Shows what I know. I’m still concerned about the 00000 zip codes, though: let’s look at that.

requests.filter(
    pl.col('Incident Zip') == '00000'
)

shape: (2, 52)

Unique Key	Created Date	Closed Date	Agency	Agency Name	Complaint Type	Descriptor	Location Type	Incident Zip	Incident Address	Street Name	Cross Street 1	Cross Street 2	Intersection Street 1	Intersection Street 2	Address Type	City	Landmark	Facility Type	Status	Due Date	Resolution Action Updated Date	Community Board	Borough	X Coordinate (State Plane)	Y Coordinate (State Plane)	Park Facility Name	Park Borough	School Name	School Number	School Region	School Code	School Phone Number	School Address	School City	School State	School Zip	School Not Found	School or Citywide Complaint	Vehicle Type	Taxi Company Borough	Taxi Pick Up Location	Bridge Highway Name	Bridge Highway Direction	Road Ramp	Bridge Highway Segment	Garage Lot Name	Ferry Direction	Ferry Terminal Name	Latitude	Longitude	Location
i64	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	i64	i64	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	f64	f64	str
26529313	"10/22/2013 02:51:06 PM"	null	"TLC"	"Taxi and Limousine Commission"	"Taxi Complaint"	"Driver Complaint"	null	"00000"	"EWR EWR"	"EWR"	null	null	null	null	null	"NEWARK"	null	null	"Assigned"	"12/07/2013 09:53:51 AM"	"10/23/2013 09:54:43 AM"	"0 Unspecified"	"Unspecified"	null	null	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"N"	null	null	null	"Other"	null	null	null	null	null	null	null	null	null	null
26507389	"10/17/2013 05:48:44 PM"	null	"TLC"	"Taxi and Limousine Commission"	"Taxi Complaint"	"Driver Complaint"	"Street"	"00000"	"1 NEWARK AIRPORT"	"NEWARK AIRPORT"	null	null	null	null	null	"NEWARK"	null	null	"Assigned"	"12/02/2013 11:59:46 AM"	"10/18/2013 12:01:08 PM"	"0 Unspecified"	"Unspecified"	null	null	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"Unspecified"	"N"	null	null	null	"Other"	null	null	null	null	null	null	null	null	null	null

This looks bad to me. Let’s set these to nan.

requests = requests.with_columns(
    pl.when(pl.col('Incident Zip') == '00000').then(None).otherwise(pl.col('Incident Zip')).alias('Incident Zip')
)
requests.filter(
    pl.col('Incident Zip') == '00000'
)

shape: (0, 52)

Unique Key	Created Date	Closed Date	Agency	Agency Name	Complaint Type	Descriptor	Location Type	Incident Zip	Incident Address	Street Name	Cross Street 1	Cross Street 2	Intersection Street 1	Intersection Street 2	Address Type	City	Landmark	Facility Type	Status	Due Date	Resolution Action Updated Date	Community Board	Borough	X Coordinate (State Plane)	Y Coordinate (State Plane)	Park Facility Name	Park Borough	School Name	School Number	School Region	School Code	School Phone Number	School Address	School City	School State	School Zip	School Not Found	School or Citywide Complaint	Vehicle Type	Taxi Company Borough	Taxi Pick Up Location	Bridge Highway Name	Bridge Highway Direction	Road Ramp	Bridge Highway Segment	Garage Lot Name	Ferry Direction	Ferry Terminal Name	Latitude	Longitude	Location
i64	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	i64	i64	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	f64	f64	str

Great. Let’s see where we are now:

unique_zips = requests['Incident Zip'].unique().sort()
unique_zips

shape: (246,)

Incident Zip
str
null
"00083"
"02061"
"06901"
"07020"
…
"70711"
"77056"
"77092"
"90010"
"92123"

Amazing! This is much cleaner. There’s something a bit weird here, though – I looked up 77056 on Google maps, and that’s in Texas.

Let’s take a closer look:

requests.lazy().select(
    'Incident Zip',
    'Descriptor',
    'City'
).filter(
    pl.col('Incident Zip') == "77056"
).sort('Incident Zip').collect()

shape: (1, 3)

Incident Zip	Descriptor	City
str	str	str
"77056"	"Debt Not Owed"	"HOUSTON"

Okay, there really are requests coming from Houston! Good to know. Filtering by zip code is probably a bad way to handle this – we should really be looking at the city instead.

requests['City'].str.to_uppercase().value_counts(sort=True)

shape: (101, 2)

City	count
str	u32
"BROOKLYN"	31662
"NEW YORK"	22664
"BRONX"	18438
null	12215
"STATEN ISLAND"	4766
…	…
"SYRACUSE"	1
"NANUET"	1
"FARMINGDALE"	1
"NEW YOR"	1
"NEWARK AIRPORT"	1

There are 12,215 null values in the City column. Upon closer look, it seems that many of these rows also have missing Incident Zip values as well:

requests.select(
    'Incident Zip',
    'Descriptor',
    'City'
).filter(
    pl.col('City').is_null()
).sort('Incident Zip')

shape: (12_215, 3)

Incident Zip	Descriptor	City
str	str	str
null	"Street Light Out"	null
null	"Street Light Out"	null
null	"Medicaid"	null
null	"Controller"	null
null	"Property Tax Exemption Applica…	null
…	…	…
null	"Street Light Out"	null
null	"Street Light Out"	null
null	"Property Tax Exemption Applica…	null
"10022"	"Driver Complaint"	null
"11429"	"Dead Animal"	null

7.4 Putting it together

Here’s what we ended up doing to clean up our zip codes, all together:

null_values = ['NO CLUE', 'N/A', '0', 'NA']
requests = (
    pl.scan_csv('../data/311-service-requests.csv', null_values=null_values, schema_overrides={'Incident Zip':pl.String})
    .with_columns(pl.col('Incident Zip').str.slice(0, 5))
)
requests = (
    requests
    .with_columns(pl.when(pl.col('Incident Zip') == '00000').then(None).otherwise(pl.col('Incident Zip')).alias('Incident Zip'))
    .filter(pl.col('Incident Zip').is_not_null())
    .collect()
)

requests['Incident Zip'].unique().sort()

shape: (245,)

Incident Zip
str
"00083"
"02061"
"06901"
"07020"
"07087"
…
"70711"
"77056"
"77092"
"90010"
"92123"