Yelp Reviews Exploration

Xiong Zheng

1 Introduction

In this project, we'll explore restaurant review data available through the Yelp Dataset Challenge. The dataset includes Yelp data for user reviews and business information for 10 metropolitan areas. This project is broken into two parts:

Part 1: Testing how well sentiment analysis works
Part 2: Analyzing correlations between restaurant reviews and census data

In [39]:
import pandas as pd
import geopandas as gpd
import nltk
import string
import textblob
import seaborn as sns
import re
import contextily as cx
import matplotlib
from matplotlib import pyplot as plt
import numpy as np
import cenpy
import hvplot.pandas
import geoviews as gv
import geoviews.tile_sources as gvts
import holoviews as hv
from sklearn.linear_model import LinearRegression
from mpl_toolkits.axes_grid1 import make_axes_locatable
from holoviews import opts
from shapely.geometry import Polygon
# Download the NLTK 'stopwords' corpus; the sentiment cell below reads it via
# nltk.corpus.stopwords.words('english').
nltk.download('stopwords')
# Register the bokeh and matplotlib backends with HoloViews.
hv.extension('bokeh', 'matplotlib');

2 Sentiment analysis

First, format the review text: split it into its individual words and make all of the words lower-cased. Second, remove any stop words from the list of words in each review. Then, calculate the polarity and subjectivity of each review. Last, compare the sentiment analysis to the number of stars.

In [40]:
# Load the Cleveland reviews (JSON records, one per line).
review_clv = pd.read_json("./data/reviews_cleveland.json.gz", orient="records", lines=True)

# Lower-case every review, then tokenize it into runs of word characters
# or runs of non-word, non-space characters (punctuation clusters).
review_clv["formatted_text"] = (
    review_clv["text"].str.lower().str.findall(r'\w+|[^\s\w]+')
)

stop_words = list(set(nltk.corpus.stopwords.words('english')))
punctuation = list(string.punctuation)
ignored = stop_words + punctuation

# Drop stop words, bare punctuation, and any token that starts with a
# punctuation character (e.g. "..." or "!!").
# A set makes each membership test O(1) instead of scanning the list, and
# str.startswith accepts a tuple, so one pass over the tokens replaces one
# full rebuild of the column per punctuation character.
ignored_lookup = set(ignored)
punctuation_prefixes = tuple(punctuation)
review_clv["formatted_text"] = [
    [
        token
        for token in tokens
        if token not in ignored_lookup
        and not token.startswith(punctuation_prefixes)
    ]
    for tokens in review_clv["formatted_text"]
]

# Re-join the cleaned tokens and score each review with TextBlob.
blobs = [textblob.TextBlob(" ".join(tokens)) for tokens in review_clv["formatted_text"]]
review_clv['polarity'] = [blob.sentiment.polarity for blob in blobs]
review_clv['subjectivity'] = [blob.sentiment.subjectivity for blob in blobs]

# Instead of making two plots, a side-by-side plot is made to display the comparison.

# Styling and the default figure size only affect figures created AFTER they
# are set, so configure them before calling plt.subplots.
# sns.set() resets the style to its default, so the whitegrid style has to be
# passed in the same call rather than via a preceding sns.set_style().
sns.set(style="whitegrid", rc={'axes.facecolor': '#F0F0F1'})
plt.rcParams["figure.figsize"] = [16, 10]

figure, axes = plt.subplots(1, 2)
figure.suptitle("Polarity & Subjectivity Plots based on Review Stars", fontsize=25)
flierprops = dict(marker='+', markerfacecolor='black', markersize=1, markeredgecolor='black')

# Left panel: polarity per star rating.
boxplot1 = sns.boxplot(
    y='polarity',
    x='stars',
    linewidth=1,
    data=review_clv,
    color="#5687BA",
    ax=axes[0],
    width=0.6,
    flierprops=flierprops)

# Right panel: subjectivity per star rating.
boxplot2 = sns.boxplot(
    y='subjectivity',
    x='stars',
    linewidth=1,
    data=review_clv,
    color="#E23446",
    ax=axes[1],
    width=0.6,
    flierprops=flierprops)

# Apply the same labelling and cosmetic tweaks to both panels.
for panel, y_label in ((boxplot1, "Polarity"), (boxplot2, "Subjectivity")):
    panel.set_xlabel("Stars", fontsize=14)
    panel.set_ylabel(y_label, fontsize=14)
    # Dashed reference line at zero.
    panel.axhline(y=0, c='k', lw=2, linestyle='--')

    # Make the box fills semi-transparent.
    # NOTE(review): on newer matplotlib/seaborn the boxes may be stored in
    # `panel.patches` instead of `panel.artists` -- confirm this still applies.
    for patch in panel.artists:
        r, g, b, _ = patch.get_facecolor()
        patch.set_facecolor((r, g, b, .6))

    # Draw whiskers, caps, medians and the reference line in black.
    for line in panel.get_lines():
        line.set_color('black')

plt.show()
drawing

Question: What do your charts indicate for the effectiveness of our sentiment analysis?
Answer: The polarities rise as the stars go up; however, the subjectivities rise relatively slowly. This indicates that the sentiment analysis is not accurate, because polarities should correspond with subjectivities, rising at a similar speed.

2.1 The importance of individual words

In this part, we'll explore the importance and frequency of individual words in Yelp reviews. We will identify the most common words and then plot the average polarity vs. the average user stars for the reviews in which those words occur.

In [41]:
# Draw a reproducible (fixed-seed) random subset of 1000 reviews.
review_clv_1000 = review_clv.sample(random_state=1, n=1000)

def reshape_data(review_subset):
    """
    Reshape the input dataframe of review data to one row per word.

    Parameters
    ----------
    review_subset : pandas.DataFrame
        Must contain the columns 'formatted_text' (a list of words per
        review), 'polarity', 'stars' and 'review_id'.

    Returns
    -------
    pandas.DataFrame
        Columns 'polarity', 'stars', 'review_id' and 'word', with one row
        for every word of every review and a fresh 0..n-1 index. Reviews
        whose word list is empty contribute no rows.
    """
    from pandas import merge

    # explode() emits one row per list element while keeping the original
    # index; an empty list becomes a single NaN row, which dropna() removes.
    # This avoids building a wide, NaN-padded frame via apply(Series) and
    # does not rely on DataFrame.stack() silently dropping NaN values.
    words = (
        review_subset['formatted_text']
        .explode()
        .dropna()
        .to_frame('word')
    )

    review_columns = review_subset[['polarity', 'stars', 'review_id']]

    # Join each word back to its review's attributes through the shared index.
    return merge(
        review_columns, words, left_index=True, right_index=True
    ).reset_index(drop=True)

# Explode the 1000-review sample into one row per word.
review_clv_1000_2 = review_clv_1000
review_clv_1000_2_reshape = reshape_data(review_clv_1000_2)

# Same frame under a second name; the constant 'count' column lets a
# group-wise sum act as a word-frequency counter.
review_clv_1000_2_reshape_text = review_clv_1000_2_reshape
review_clv_1000_2_reshape_text['count'] = 1

# Aggregate per word: number of occurrences, mean stars and mean polarity.
per_word = review_clv_1000_2_reshape_text.groupby(["word"], as_index=False)
size = per_word['count'].sum()
avg_stars = per_word['stars'].mean()
avg_polarity = per_word['polarity'].mean()

# Combine the three aggregates into one table keyed by word.
summary = size.merge(avg_stars, on="word").merge(avg_polarity, on="word")

# Keep only words that occur at least 50 times.
is_frequent = summary["count"] >= 50
filtered = summary.loc[is_frequent]
filtered
word count stars polarity
5 1 65 3.107692 0.194570
6 10 50 3.440000 0.181099
49 2 100 3.420000 0.196473
77 3 88 3.102273 0.185049
92 4 84 3.321429 0.171560
... ... ... ... ...
7602 went 145 3.379310 0.186026
7649 wine 52 4.192308 0.314430
7686 work 51 3.450980 0.244607
7698 worth 57 3.543860 0.236521
7700 would 281 3.071174 0.173317
In [42]:
# Overall averages across all reviews, drawn below as reference lines.
mean_polarity = review_clv['polarity'].mean()
mean_stars = review_clv['stars'].mean()
print("\n","Mean Polarity",mean_polarity,"\n","Mean Stars",mean_stars)

# The axes background is read from rcParams when the axes are created, so it
# has to be set before plt.subplots for it to apply to this figure.
plt.rcParams['axes.facecolor'] = 'white'

# Plot Scatter: one bubble per frequent word, sized by how often it occurs.
fig, ax = plt.subplots(figsize=(90, 55))
size_scatter = 12 * filtered['count']
ax.scatter('polarity', 'stars', s=size_scatter,
           data=filtered, color="red", alpha=0.5,
           edgecolors="white", linewidth=2)
ax.set_xlim(0.05, 0.42)
ax.set_ylim(2, 4.5)
ax.grid(color='#A8A8A8', lw=5, linestyle='dashed')

# Plot Text: label each bubble with its word, nudged slightly to the right.
# Columns are addressed by name rather than by position so the labels do not
# depend on the column order of `filtered`.
for word, word_stars, word_polarity in zip(
    filtered['word'], filtered['stars'], filtered['polarity']
):
    ax.text(x=word_polarity + 0.001, y=word_stars, s=word)

# Setting
ax.axvline(x=mean_polarity, c='k', lw=10, linestyle='dashed')
ax.axhline(y=mean_stars, c='k', lw=10, linestyle='dashed')
ax.set_xlabel("Polarity", fontsize=75)
ax.set_ylabel("Stars", fontsize=75);
ax.set_title("Polarity as a function of Reviews' stars", fontsize=100);
ax.tick_params(axis='both', which='major', labelsize=65)
ax.spines["left"].set_color("black")
ax.spines["bottom"].set_color("black")

# Regression Line: fit stars ~ log(polarity).
# The logarithm is undefined for non-positive values, so only words with a
# strictly positive mean polarity are used for the fit.
has_positive_polarity = filtered['polarity'] > 0
polarity = filtered.loc[has_positive_polarity, 'polarity'].values.reshape(-1, 1)
stars = filtered.loc[has_positive_polarity, 'stars'].values
linear_regressor = LinearRegression()
linear_regressor.fit(np.log(polarity), stars)

# Evaluate the fit on an even grid spanning the visible x-range and map the
# grid back from log space for plotting.
x_pred = np.log(np.linspace(0.05, 0.42, num=200).reshape(-1, 1))
y_pred = linear_regressor.predict(x_pred)
ax.plot(np.exp(x_pred), y_pred, color="#696969", lw=10)

plt.show()

 Mean Polarity: 0.25948875457930276 
 Mean Stars: 3.7636730794810096
 
drawing

3 Correlating restaurant data and household income

In this part, we'll use the census API to download household income data and overlay restaurant locations.

In [43]:
# Connect to the census API product "ACSDT5Y2018".
acs = cenpy.remote.APIConnection("ACSDT5Y2018")

# Query variable B19013_001E for every tract in state 39 / county 035 and
# give it a readable column name.
income_query = acs.query(
    cols=["NAME", "B19013_001E"],
    geo_unit="tract:*",
    geo_filter={"state": "39", "county": "035"},
)
clv_MedHHInc_tract = income_query.rename(
    columns={"B19013_001E": "MedHHInc"}, errors="raise"
)

# Fetch the tract geometries for the same state/county from the map service.
acs.set_mapservice("tigerWMS_ACS2018")
where_clause = "STATE = 39 AND COUNTY = 035"
tract_layer = acs.mapservice.layers[8]
clv_tracts = tract_layer.query(where=where_clause)

# Join geometries to the income table, keep the needed columns, reproject to
# EPSG:32617 and restore the plain "NAME" column name.
tracts_with_income = clv_tracts.merge(
    clv_MedHHInc_tract,
    left_on=["STATE", "COUNTY", "TRACT"],
    right_on=["state", "county", "tract"],
)
kept_columns = ['geometry', 'NAME_y', 'MedHHInc', 'state', 'county', 'tract']
clv_MMedHHInc_M = (
    tracts_with_income.loc[:, kept_columns]
    .to_crs(epsg=32617)
    .rename(columns={"NAME_y": "NAME"}, errors="raise")
)

# Convert income to rounded floats and keep only tracts with a positive value
# (presumably this drops missing-data placeholders -- confirm).
rounded_income = clv_MMedHHInc_M['MedHHInc'].astype(float).round()
clv_MMedHHInc_M['MedHHInc'] = rounded_income
clv_MMedHHInc_M = clv_MMedHHInc_M[rounded_income > 0]

# Quantile choropleth of median household income over an OpenStreetMap basemap.
fig, ax = plt.subplots(figsize=(20, 20))
choropleth_style = dict(
    column='MedHHInc',
    legend=True,
    cmap='viridis',
    scheme='quantiles',
    alpha=0.4,
    edgecolor='k',
)
clv_MMedHHInc_M.plot(ax=ax, **choropleth_style)

cx.add_basemap(ax, zoom=12, crs=clv_MMedHHInc_M.crs, source=cx.providers.OpenStreetMap.Mapnik)
ax.set_axis_off()
ax.set_title("Map of Median Household Income, Cleveland", fontsize=25);
drawing

3.2 Overlay the restaurants data

In this section, we will overlay the restaurants and color the points according to the 'stars' column.

In [44]:
# Load the restaurants (JSON records, one per line) and turn the
# longitude/latitude columns into point geometries in EPSG:4326, then
# reproject to EPSG:32617 to match the income tracts.
clv_res = pd.read_json("./data/restaurants_cleveland.json.gz", orient="records", lines=True)
restaurant_points = gpd.points_from_xy(clv_res['longitude'], clv_res['latitude'])
clv_res['geometry'] = restaurant_points
clv_res = gpd.GeoDataFrame(clv_res, geometry='geometry', crs="EPSG:4326").to_crs(epsg=32617)

# Options shared by both interactive layers.
layer_kwargs = dict(frame_width=780, frame_height=600, crs=32617, dynamic=False)

# Income tracts coloured by median household income.
Income = clv_MMedHHInc_M.hvplot(
    c='MedHHInc',
    cmap='viridis',
    alpha=0.7,
    **layer_kwargs,
)

# Restaurant points coloured by star rating, with name/stars on hover.
Restaurant = clv_res.hvplot(
    c="stars",
    hover_cols=['name', 'stars'],
    alpha=0.9,
    **layer_kwargs,
).options(cmap=["#FFEE00", "#F55368"])

# Stack the tile basemap, the income tracts and the restaurants.
combination3 = gvts.Wikipedia * Income * Restaurant

combination3.opts(
    opts.WMTS(width=780, height=600, xaxis=None, yaxis=None),
    opts.Overlay(title="Map of Restaurant and Median Household Income"))

3.3 Comparing polarity vs. stars geographically

In [45]:
# Keep only the join key and the polarity score of each review ...
review_clv_2 = review_clv.loc[:, ["business_id", "polarity"]]
# ... and attach them to the restaurant rows with a matching business_id,
# giving one row per (restaurant, review) pair.
mergedf = pd.merge(left=clv_res, right=review_clv_2, on="business_id")
In [46]:
def _add_colorbar(figure, axis, mappable, title):
    """Attach a vertical colorbar for *mappable* to the right of *axis* and title it."""
    divider = make_axes_locatable(axis)
    colorbar_axis = divider.append_axes('right', size='5%', pad=0.05)
    colorbar = figure.colorbar(mappable, cax=colorbar_axis, orientation='vertical')
    colorbar.ax.set_title(title)
    return colorbar


# Set canvas: review polarity on the left, restaurant stars on the right.
fig, axs = plt.subplots(ncols=2, figsize=(20, 10))
ax1, ax2 = axs

# hexbin coordinate
xcoords = mergedf.geometry.x
ycoords = mergedf.geometry.y
polarity = mergedf.polarity

# Tract plot: draw the tract outlines on both panels.
for panel in (ax1, ax2):
    clv_MMedHHInc_M.plot(ax=panel, facecolor="none", edgecolor="black", linewidth=0.25)

# Hexbin plot: each hexagon shows the median of the values falling inside it.
hex_vals1 = ax1.hexbin(
    xcoords,
    ycoords,
    gridsize=30,
    C=polarity,
    reduce_C_function=np.median)

hex_vals2 = ax2.hexbin(
    xcoords,
    ycoords,
    gridsize=30,
    C=mergedf.stars,
    reduce_C_function=np.median)

# Titles (spelling of "Cleveland" corrected).
ax1.set_title("Map of Review Polarities, Cleveland", fontsize=18);
ax2.set_title("Map of Restaurant, Cleveland", fontsize=18);

ax1.set_axis_off()
ax2.set_axis_off()

# Color bar
colorbar_polarity = _add_colorbar(fig, ax1, hex_vals1, 'Polarity')
colorbar_stars = _add_colorbar(fig, ax2, hex_vals2, 'Star')

# Basemap plot
for panel in (ax1, ax2):
    cx.add_basemap(panel, zoom=13, crs=clv_MMedHHInc_M.crs, source=cx.providers.OpenStreetMap.Mapnik)
drawing