DOB Complaints

Xiong Zheng

In this project, we'll explore the DOB complaints in New York City.

1 Data Wrangling

In this section, we will read and wrangle the dataset:

Merge 'Complaint Category' and 'Complaint Category Description'
Time Arrangement
Allocate borough

In [83]:
import pandas as pd
import time
import seaborn as sns
import altair as alt  
from datetime import datetime, timedelta, date, time
from matplotlib import pyplot as plt
pd.options.mode.chained_assignment = None

DOBCR = pd.read_csv("DOB_Complaints_Received.csv")
DOBCR = DOBCR.sort_values(["Date Entered"],ascending = False)

DOBCatalog = pd.read_csv("complaint_category.csv")
DOBCatalog = DOBCatalog.rename(columns={'CODE':'Complaint Category'})

# Delete rows of 'Active' complaints
delete = DOBCR.loc[(DOBCR['Status']=="ACTIVE")]
DOBCR = DOBCR.drop(delete.index)

# Delete unwanted columns
drop = pd.Series(['Status','House Number','House Street','BIN','Special District','Disposition Code','DOBRunDate','Inspection Date'])
DOBCR2 = DOBCR.drop(drop, axis=1)

# Merge 'Complaint Category' and 'Complaint Category Description'
DOBCR3 = DOBCR2.merge(DOBCatalog,on=['Complaint Category'], how='left')

# Time Arrangement
DOBCR3['Date Entered'] = pd.to_datetime(DOBCR3['Date Entered'])
DOBCR3['Disposition Date'] = pd.to_datetime(DOBCR3['Disposition Date'])

mask = (DOBCR3['Date Entered']> '2020-01-01')
T_DOBCR = DOBCR3.loc[mask]
T_DOBCR['counter'] = float(1)
T_DOBCR = T_DOBCR.sort_values(['Disposition Date'],ascending = True)

T_DOBCR ['Month_DEntered'] = T_DOBCR ['Date Entered'].dt.strftime("%m")
T_DOBCR ['Y_DEntered'] = T_DOBCR ['Date Entered'].dt.strftime("%Y")

T_DOBCR['ReactionTime'] = (T_DOBCR['Disposition Date'] - T_DOBCR['Date Entered']).dt.days
T_DOBCR['ReactionTime'] = T_DOBCR['ReactionTime'] + int(1)

# Allocate borough
T_DOBCR.loc[T_DOBCR['Complaint Number']<=5999999, 'borough'] = 'Staten Island'
T_DOBCR.loc[T_DOBCR['Complaint Number']<=4999999, 'borough'] = 'Queen'
T_DOBCR.loc[T_DOBCR['Complaint Number']<=3999999, 'borough'] = 'Brooklyn'
T_DOBCR.loc[T_DOBCR['Complaint Number']<=2999999, 'borough'] = 'Bronx'
T_DOBCR.loc[T_DOBCR['Complaint Number']<=1999999, 'borough'] = 'Manhattan'

2 'Different Complaint Type' & 'Occurrence Frequency' in Different Region

From the graph, we can see that Bronx County has relatively concerntrated complaints on '6s' and '6M' types. Brooklyn, Manhattan and Queen counties have realtively larger number of complaints and similar complaint distributions, concerntrating on 04,1X, 2N, 6S, 7G, and 8A types.

In [86]:
DDsumDOB = T_DOBCR.groupby(["COMPLAINT CATEGORY DESCRIPTION","Y_DEntered",'borough'],as_index=False).sum('counter').sort_values(["counter"],ascending = False)
DDsumDOB2 = T_DOBCR.groupby(["COMPLAINT CATEGORY DESCRIPTION",'borough'],as_index=False).sum('counter').sort_values(["counter"],ascending = False)
DDsumDOB = DDsumDOB.drop('ReactionTime',axis=1)
DDsumDOB2 = DDsumDOB2.drop('ReactionTime',axis=1)

DDsumDOB = DDsumDOB.merge(DOBCatalog,on = ['COMPLAINT CATEGORY DESCRIPTION'], how='left')
DDsumDOB2 = DDsumDOB2.merge(DOBCatalog,on = ['COMPLAINT CATEGORY DESCRIPTION'], how='left')

Bronx = DDsumDOB.loc[DDsumDOB.borough=='Bronx']
Brooklyn = DDsumDOB.loc[DDsumDOB.borough=='Brooklyn']
Manhattan = DDsumDOB.loc[DDsumDOB.borough=='Manhattan']
Queen = DDsumDOB.loc[DDsumDOB.borough=='Queen']
StatenIsland = DDsumDOB.loc[DDsumDOB.borough=='Staten Island']

Bronx2020 = Bronx.loc[Bronx.Y_DEntered=='2020']
Bronx2020 = Bronx2020.sort_values(["Complaint Category"],ascending = False)
Bronx2021 = DDsumDOB2.loc[DDsumDOB2.borough=='Bronx']
Bronx2021 = Bronx2021.sort_values(["Complaint Category"],ascending = False)

Brooklyn2020 = Brooklyn.loc[Brooklyn.Y_DEntered=='2020']
Brooklyn2021 = DDsumDOB2.loc[DDsumDOB2.borough=='Brooklyn']
Brooklyn2020 = Brooklyn2020.sort_values(["Complaint Category"],ascending = False)
Brooklyn2021 = Brooklyn2021.sort_values(["Complaint Category"],ascending = False)

Manhattan2020 = Manhattan.loc[Manhattan.Y_DEntered=='2020']
Manhattan2021 = DDsumDOB2.loc[DDsumDOB2.borough=='Manhattan']
Manhattan2020 = Manhattan2020.sort_values(["Complaint Category"],ascending = False)
Manhattan2021 = Manhattan2021.sort_values(["Complaint Category"],ascending = False)

Queen2020 = Queen.loc[Queen.Y_DEntered=='2020']
Queen2021 = DDsumDOB2.loc[DDsumDOB2.borough=='Queen']
Queen2020 = Queen2020.sort_values(["Complaint Category"],ascending = False)
Queen2021 = Queen2021.sort_values(["Complaint Category"],ascending = False)

StatenIsland2020 = StatenIsland.loc[StatenIsland.Y_DEntered=='2020']
StatenIsland2021 = DDsumDOB2.loc[DDsumDOB2.borough=='StatenIsland']
Queen2020 = Queen2020.sort_values(["Complaint Category"],ascending = False)
Queen2021 = Queen2021.sort_values(["Complaint Category"],ascending = False)


# Plot
fig, ax = plt.subplots(figsize=(50, 70))

plt.subplot(141)
plt.scatter( 'counter','Complaint Category', data=Bronx2020, s='counter',marker='o', alpha=0.6)
plt.scatter( 'counter','Complaint Category', data=Bronx2021, s='counter',marker='o', alpha=0.8)
plt.title("Bronx",fontsize=60)
plt.xlabel("Occurrence",fontsize=40)
plt.ylabel("Complaint Category",fontsize=40)
plt.grid(color='lightgrey', linestyle='dashed')
plt.legend(('2020', 'Since 2020'),
           loc='upper right', shadow=True,fontsize=40)
plt.xlim(-1, 4000)


plt.subplot(142)
plt.scatter( 'counter','Complaint Category', data=Brooklyn2020, s=300,marker='o', alpha=0.4)
plt.scatter( 'counter','Complaint Category',data=Brooklyn2021, s=500,marker='o', alpha=0.6)
plt.title("Brooklyn",fontsize=60)
plt.xlabel("Occurrence",fontsize=40)
plt.ylabel("Complaint Category",fontsize=40)
plt.grid(color='lightgrey', linestyle='dashed')
plt.legend(('2020', 'Since 2020'),
           loc='upper right', shadow=True,fontsize=40)
plt.xlim(-1, 4000)


plt.subplot(143)
plt.scatter( 'counter','Complaint Category', data=Manhattan2020,s=300, marker='o', alpha=0.4)
plt.scatter('counter','Complaint Category', data=Manhattan2021, s=500,marker='o', alpha=0.6)
plt.title("Manhattan",fontsize=60)
plt.xlabel("Occurrence",fontsize=40)
plt.ylabel("Complaint Category",fontsize=40)
plt.grid(color='lightgrey', linestyle='dashed')
plt.legend(('2020', 'Since 2020'),
           loc='upper right', shadow=True,fontsize=40)
plt.xlim(-1, 4000)


plt.subplot(144)
plt.scatter( 'counter','Complaint Category',data=Queen2020, s=300,marker='o', alpha=0.4)
plt.scatter('counter','Complaint Category', data=Queen2021, s=500,marker='o', alpha=0.6)
plt.title("Queen",fontsize=60)
plt.xlabel("Occurrence",fontsize=40)
plt.ylabel("Complaint Category",fontsize=40)
plt.grid(color='lightgrey', linestyle='dashed')
plt.legend(('2020', 'Since 2020'),
           loc='upper right', shadow=True,fontsize=40)
plt.xlim(-1, 4000)


plt.show()

3 'Complaint Type' & 'Occurrence'

From the graph, we can see that Permit - None (Building/PA/Demo etc.), Illegal Conversion, Construction Safety Compliance (CSC) Action, Elevator: Single Device on Property/No Alternate Service, and CSE:Sweep are top 5 complaint types in NYC. And we see a large-increased complaints of CSE:Sweep in 2021.

In [89]:
T_DOBCR['Complaint Number']=str(T_DOBCR['Complaint Number'])
sumDOB = T_DOBCR.groupby(["COMPLAINT CATEGORY DESCRIPTION","Y_DEntered"],as_index=False).sum('counter').sort_values(["counter"],ascending = False)
sumDOB_drop = sumDOB.loc[(sumDOB['counter']< 100)]
sumDOB_dd = sumDOB.drop(sumDOB_drop.index)
sumDOB_dd = sumDOB_dd.drop('ReactionTime',axis=1)
sumDOB_dd = sumDOB_dd.pivot(index='COMPLAINT CATEGORY DESCRIPTION',columns='Y_DEntered',values='counter')
sumDOB_dd.reset_index(drop=False, inplace=True)
sumDOB_dd = sumDOB_dd.sort_values(["2020"],ascending = False)
sumDOB_dd['2020'].fillna(0, inplace=True)
sumDOB_dd['2021'].fillna(0, inplace=True)
sumDOB_dd['total'] = (sumDOB_dd['2020'] + sumDOB_dd['2021'])

# Plot
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(6, 18))
sns.set_color_codes("pastel")

sns.barplot(
    x="total",
    y="COMPLAINT CATEGORY DESCRIPTION",
    data=sumDOB_dd,
    label = "2021",
    color="g",
    ax=ax,
    zorder=999,
)

sns.barplot(
    x="2020",
    y="COMPLAINT CATEGORY DESCRIPTION",
    data=sumDOB_dd,
    label = "2020",
    color="b",
    ax=ax,
    zorder=999,
)

sns.set_context(
    "poster",
    font_scale=1,
    rc={"lines.linewidth":2.5}
               ) 
ax.set_xlabel("Occurrence",fontsize=16)
ax.set_ylabel("Complaint Type",fontsize=16)
ax.grid(True, axis="y")
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.set_title(
    "Relationship between 'Complaint Type' & 'Occurrence' in NYC since 2020", weight="bold", fontsize=16
)

ax.legend(ncol=1, loc="lower right", frameon=True,prop={'size':16});

4 'Borough' & 'Occurrence'

From the graph, we can see that compared to other counties, Brooklyn has the highest number of DOB complaints and even gets a higher increase rate in 2021.

In [90]:
DistDOB = T_DOBCR.groupby(["borough","Y_DEntered"],as_index=False).sum('counter').sort_values(["counter"],ascending = False)
DistDOB_drop = DistDOB.loc[(DistDOB['counter']< 100)]
DistDOB_dd = DistDOB.drop(DistDOB_drop.index)
DistDOB_dd = DistDOB_dd.drop('ReactionTime',axis=1)
DistDOB_dd = DistDOB_dd.pivot(index='borough',columns='Y_DEntered',values='counter')
DistDOB_dd.reset_index(drop=False, inplace=True)
DistDOB_dd = DistDOB_dd.sort_values(["2020"],ascending = False)
DistDOB_dd['2020'].fillna(0, inplace=True)
DistDOB_dd['2021'].fillna(0, inplace=True)
DistDOB_dd['total'] = (DistDOB_dd['2020'] + DistDOB_dd['2021'])

# Plot
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(6, 6))
sns.set_color_codes("pastel")

sns.barplot(
    x="borough",
    y="total",
    data=DistDOB_dd,
    label = "2021",
    color="g",
    ax=ax,
    zorder=999,
)

sns.barplot(
    x="borough",
    y="2020",
    data=DistDOB_dd,
    label = "2020",
    color="b",
    ax=ax,
    zorder=999,
)

sns.set_context(
    "poster",
    font_scale=0.5,
    rc={"lines.linewidth":2.5}
               ) 
ax.set_xlabel("Borough",fontsize=16)
ax.set_ylabel("Occurrence",fontsize=16)
ax.grid(True, axis="y")
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.set_title(
    "Relationship between 'Borough' & 'Occurrence' in NYC since 2020", weight="bold", fontsize=16
)

ax.legend(ncol=1, loc="upper right", frameon=True,prop={'size':10});

5 Reaction Time distribution

From the graph, we can see that, surprisingly, most of the complaints are responsed within three days. Only few complaints are responsed after ten days.

In [91]:
RTimegroup_DOB = T_DOBCR.groupby(["ReactionTime"],as_index=False).sum('counter').sort_values(["ReactionTime"],ascending = False)
RTimegroup_DOB = RTimegroup_DOB.loc[(RTimegroup_DOB['counter']> 1000)]

# Plot
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(18, 6))
sns.set_color_codes("pastel")

sns.barplot(
    x="ReactionTime",
    y="counter",
    data=RTimegroup_DOB,
    color="b",
    ax=ax,
    zorder=999,
)

sns.set_context(
    "poster",
    font_scale=1,
    rc={"lines.linewidth":1}
               ) 
ax.set_xlabel("Reaction Time",fontsize=16)
ax.set_ylabel("Frequency",fontsize=16)
ax.grid(True, axis="y")
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.set_title(
    "Reaction Time distribution in NYC since 2020", weight="bold", fontsize=16
)

ax.legend(ncol=1, loc="upper right", frameon=True,prop={'size':16});

6 Monthly Distribution and Reaction Time

From the graph, we can see that this diagram was generated for exploration. From this plot, we can know that there is not an obvious tendency of reaction time between different months, which was my bias when trying to do this plot.

In [92]:
month_T_DOBCR = T_DOBCR.groupby(['Month_DEntered','ReactionTime'],as_index=False).sum('counter')
month_T_DOBCR = month_T_DOBCR.loc[month_T_DOBCR['counter']>100]

# Plot
chart = alt.Chart(month_T_DOBCR).mark_circle(
    opacity=0.6,
    stroke='black',
    strokeWidth=1
).encode(
    alt.X('Month_DEntered:O',axis=alt.Axis(labelAngle=0)),
    alt.Y('ReactionTime:N',axis=alt.Axis(values=[1,11,21,31,41,51,61])),
    alt.Size('counter:Q',
        scale=alt.Scale(range=[0, 500]),
        legend=alt.Legend(title='Total Occurrence')
    ),
    tooltip=["Month_DEntered","ReactionTime","counter"],
    color = alt.value("#FF8542")
).properties(
    width=600,
    height=400,
    title = 'Monthly Distribution of Reaction Time'
)
chart.encoding.x.title = 'Month'
chart.encoding.y.title = 'Reaction Time / day'
chart

7 Time Distribution of Different Complaint Type

From the graph, we can see that. This huge plot was made to take a detailed look at the time distribution of different complaint type since 2020. Though the information may not be so abundat as it looks, but it still provides some information. Most of the complaints don't have a specific burst period but are evenly distributed during the year. However, '7G' complaint has a surge from 06/2020 - 07/2020. '2N' had a realtively densed distribution from 04/2020 - 06/2020. Also the complaints decreased between 04/2020 - 07/2020, which might be the cause of COVID-19.

In [93]:
dt_T_DOBCR = T_DOBCR.groupby(['Date Entered','COMPLAINT CATEGORY DESCRIPTION'],as_index=False).sum()
dt_T_DOBCR = dt_T_DOBCR.drop('ReactionTime',axis=1)
dt_T_DOBCR = dt_T_DOBCR.sort_values('counter')
CODE_dt_T_DOBCR = dt_T_DOBCR.merge(DOBCatalog,on = ['COMPLAINT CATEGORY DESCRIPTION'], how='left')

# Plot
alt.data_transformers.disable_max_rows()

chart2 = alt.Chart(CODE_dt_T_DOBCR).mark_circle(
    opacity=0.4,
    stroke='darkorange',
    strokeWidth=1
).encode(
    alt.X('Date Entered',axis=alt.Axis(labelAngle=0)),
    alt.Y('Complaint Category:O'),
    alt.Size('counter:Q',
        scale=alt.Scale(range=[0, 1000]),
        legend=alt.Legend(title='Total Occurrence')
    ),
    tooltip=["Date Entered","Complaint Category","COMPLAINT CATEGORY DESCRIPTION","counter"],
    color = alt.value("#F79D46")
).properties(
    width=1500,
    height=1200,
    title = 'Time Distribution of Different Complaint Type in NYC since 2020'
).configure_axis(
    labelFontSize=8,
    titleFontSize=10
).interactive()
chart2.encoding.x.title = 'Date Entered'
chart2.encoding.y.title = 'Complaint Category'

chart2

8 County Distribution of Different Complaint Type

From the graph, we can see that. Corresponding to the above plots, Brooklyn and Queen have relatively high numbers of DOB complaints. Besides, the pattern of complaint distribution does not have a county distribution tendency. The frequent complaints types, which are '03' - '05', '2N' - '45', '6M' - '74' and '83' - '91', remain nearly the same in these five counties.

In [94]:
B_T_DOBCR = T_DOBCR.groupby(['borough','COMPLAINT CATEGORY DESCRIPTION'],as_index=False).sum()
B_T_DOBCR = B_T_DOBCR.merge(DOBCatalog,on = ['COMPLAINT CATEGORY DESCRIPTION'], how='left')

alt.data_transformers.enable('json')
alt.data_transformers.disable_max_rows()
colormap = alt.Scale(
    domain=[0, 100, 200, 300, 1000, 5000],
    range=[
        "#F0F8FF",
        "cornflowerblue",
        "mediumseagreen",
        "#FFEE00",
        "darkorange",
        "firebrick",
    ],
    type="sqrt",
)

chart3 = alt.Chart(B_T_DOBCR).mark_rect().encode(
       y=alt.Y("borough:O", axis=alt.Axis(title=None, ticks=False), scale=alt.Scale(zero=False)),
       x=alt.X("Complaint Category", axis=alt.Axis(title=None, ticks=False,labelAngle=0), scale=alt.Scale(zero=False)),
       color=alt.Color("counter", sort="ascending", scale=colormap),
       tooltip=["borough", "COMPLAINT CATEGORY DESCRIPTION", "counter"],
   ).properties(
    width=1800,
    height=100,
    title = 'County Distribution of Different Complaint Type in NYC since 2020'
).configure_axis(
    labelFontSize=8,
    titleFontSize=10
)
chart3.encoding.x.title = 'Date Entered'
chart3.encoding.y.title = 'Complaint Category'
chart3

9 Reaction Time and Occurrence Frequency

Even though there is not an intuitive and overall conclusion at the first glimpse, this diagram gives an opportunity to detailedly look at the distribution pattern. For example, to the '03' complaint type, we can know that even though there is great variance in the total occurrence of five counties, the average reaction times of five counties are the same, concentrating at 5-10 days. And for '1X' complaint type, we can nearly see a reversed relationship between total occurrence and average reaction time. It seems that the more it occurs, the longer the reaction time will be, which requires us to take a further look to draw the conclusion.

In [95]:
counter_pyramid_T_DOBCR3 = T_DOBCR.groupby(['COMPLAINT CATEGORY DESCRIPTION','borough'],as_index=False).sum('counter')
counter_pyramid_T_DOBCR3 = counter_pyramid_T_DOBCR3.drop('ReactionTime',axis=1)
mean_pyramid_T_DOBCR3 = T_DOBCR.groupby(['COMPLAINT CATEGORY DESCRIPTION','borough'],as_index=False).mean('ReactionTime')
mean_pyramid_T_DOBCR3 = mean_pyramid_T_DOBCR3.drop('counter',axis=1)
c_pyramid_T_DOBCR3 = counter_pyramid_T_DOBCR3.merge(mean_pyramid_T_DOBCR3,on=['COMPLAINT CATEGORY DESCRIPTION','borough'], how='left')
c_pyramid_T_DOBCR3 = c_pyramid_T_DOBCR3.merge(DOBCatalog,on = ['COMPLAINT CATEGORY DESCRIPTION'], how='left')
c_pyramid_T_DOBCR3['ReactionTime'] = c_pyramid_T_DOBCR3['ReactionTime'].round(decimals = 1)


# Plot
brush = alt.selection(type='interval')
left = (
    alt.Chart(c_pyramid_T_DOBCR3)
    .mark_point(size=80)
    .encode(
        x=alt.X('Complaint Category:O',axis=alt.Axis( labels=False,ticks=False)),
        y=alt.Y('ReactionTime:O',axis=alt.Axis(),sort='ascending'),
        tooltip=["ReactionTime", "COMPLAINT CATEGORY DESCRIPTION", "counter"],
        color=alt.condition(brush,"borough:N" ,alt.value("lightgray"))
    ).properties( 
        selection=brush,
        width=1800, height=300)
)

middle = (
    alt.Chart(c_pyramid_T_DOBCR3)
    .mark_text()
    .encode(
        x=alt.X('Complaint Category:O',axis=None),
        text=alt.Text('Complaint Category:O'),
        color=alt.condition(brush,":O" ,alt.value("lightgray"))
).properties(width=1800, height=0)
)

right = (
    alt.Chart(c_pyramid_T_DOBCR3)
    .mark_point(size=80)
    .encode(
        x=alt.X('Complaint Category:O',axis=alt.Axis(ticks=False,labels=False)),
        y=alt.Y('counter:O',axis=alt.Axis(), sort='descending'),
        tooltip=["ReactionTime", "COMPLAINT CATEGORY DESCRIPTION",'Complaint Category:O', "counter"],
        color=alt.condition(brush,"borough:N" ,alt.value("lightgray"))
    ).properties( 
        selection=brush,
        width=1800, height=300)
)
right.encoding.y.title = 'Total Occurrence'
left.encoding.y.title = 'Average Reaction Time'
alt.vconcat(right,middle,left).configure_axis(
    labelFontSize=10,
    titleFontSize=10)

10 Dashboard

This dashboard tells us that '3A' - '4A' get large reaction time for five counties, which are longer than 55 days. In the fastest reaction time selction, Staten Island gets a largest number of occurrcence. In the longest reaction time selction, Queen county gets a largest number of occurrcence. In the mild reaction time selction, Brooklyn gets a largest number of occurrcence.

In [97]:
brush = alt.selection(type='interval')
up = (
    alt.Chart(c_pyramid_T_DOBCR3)
    .mark_point(size=40)
    .encode(
        x=alt.X('Complaint Category:O',axis=alt.Axis(labelAngle=0),scale=alt.Scale(zero=False)),
        y=alt.Y('ReactionTime:O',scale=alt.Scale(zero=False),sort='descending'),
        tooltip=["ReactionTime", "COMPLAINT CATEGORY DESCRIPTION", "counter"],
        color=alt.condition(brush,"borough:N" ,alt.value("lightgray"))
    ).properties( 
        selection=brush,
        width=700, height=300)
)

down = (
    alt.Chart(c_pyramid_T_DOBCR3)
    .mark_bar()
    .encode(
        x='count(counter):O',
        y='borough:N',
        color='borough:N',
    ).transform_filter(
        brush.ref() 
    ).properties(width=700)
)

chart4 = alt.vconcat(up,down)
chart4