Craigslist Web Scraping

Xiong Zheng

In this project, we'll extract information on apartments from the HTML of Craigslist search results.

Information:
The apartment title
The apartment price
The datetime string of the posting, e.g., '2019-03-23 12:07'

The number of bedrooms and square footage (this will be in a single element)

1 First Search Result

In [1]:
import requests
from bs4 import BeautifulSoup

# Fetch the first page of apartment search results (one or more bedrooms).
url_base = 'http://philadelphia.craigslist.org/search/apa'
params = {'bedrooms': 1}
# A timeout keeps a stalled connection from hanging the notebook forever,
# and raise_for_status() turns an HTTP error page into an immediate, readable
# exception instead of an empty result list further down.
rsp = requests.get(url_base, params=params, timeout=30)
rsp.raise_for_status()

# Each listing is an <li> inside the element with id "search-results".
Clists = BeautifulSoup(rsp.content, 'html.parser')
SResults = Clists.select("#search-results li")
if not SResults:
    raise RuntimeError(
        "No listings matched '#search-results li'; the page layout may have "
        "changed or the request may have been blocked."
    )

# Show the raw markup of the first listing so the selectors below can be read off it.
first_element = SResults[0]
print(first_element.prettify())
<li class="result-row" data-pid="7427249480">
 <a class="result-image gallery" data-ids="3:00101_k0RKftiS80cz_0CI0lT,3:00j0j_hU33zCTCslYz_0CI0oc,3:00O0O_21VjxpTJt0fz_0CI0lP" href="https://philadelphia.craigslist.org/apa/d/philadelphia-new-year-new-apartment/7427249480.html">
  <span class="result-price">
   $1,375
  </span>
 </a>
 <div class="result-info">
  <span class="icon icon-star" role="button">
   <span class="screen-reader-text">
    favorite this post
   </span>
  </span>
  <time class="result-date" datetime="2022-01-04 00:53" title="Tue 04 Jan 12:53:00 AM">
   Jan  4
  </time>
  <h3 class="result-heading">
   <a class="result-title hdrlnk" data-id="7427249480" href="https://philadelphia.craigslist.org/apa/d/philadelphia-new-year-new-apartment/7427249480.html" id="postid_7427249480">
    New Year New Apartment! 2 bedroom bi-level apartment close to all!
   </a>
  </h3>
  <span class="result-meta">
   <span class="result-price">
    $1,375
   </span>
   <span class="housing">
    2br -
                    900ft
    <sup>
     2
    </sup>
    -
   </span>
   <span class="result-hood">
    (Art Museum  )
   </span>
   <span class="result-tags">
    <span class="pictag">
     pic
    </span>
   </span>
   <span class="banish icon icon-trash" role="button">
    <span class="screen-reader-text">
     hide this posting
    </span>
   </span>
   <span aria-hidden="true" class="unbanish icon icon-trash red" role="button">
   </span>
   <a class="restore-link" href="#">
    <span class="restore-narrow-text">
     restore
    </span>
    <span class="restore-wide-text">
     restore this posting
    </span>
   </a>
  </span>
 </div>
</li>

In [10]:
# Pull the four fields of interest out of the first listing.
Price = first_element.select_one("div > span.result-meta > span.result-price").text
NbrBed = first_element.select_one("div > span.result-meta > span.housing").text
Title = first_element.select_one("div > h3> a.result-title.hdrlnk").text
Time = first_element.find("time")['datetime']

# The element text can carry newlines and indentation from the page markup;
# collapse every run of whitespace to a single space for display only
# (the variables above keep the raw text).
summary_lines = [
    "For the first search result",
    "The apartment price: " + " ".join(Price.split()),
    "The number of bedrooms and square footage: " + " ".join(NbrBed.split()),
    "The apartment title: " + " ".join(Title.split()),
    "The datetime: " + " ".join(Time.split()),
]
# One field per line; joining with "\n" avoids the leading space that
# print() inserts when "\n" is passed as a separate argument.
print("\n".join(summary_lines))
For the first research result
The apartment price: $1,375 The number of bedrooms and square footage: ;900ft2 The Datatime: 00:53 The apartment title: Year New Apartment! 2 bedroom bi-level apartment close to all!

2 Format all search results

In this section, we create functions that take the size, bedrooms, price, and time strings extracted in the last section and convert them into properly typed values.

In [11]:
import re
import numpy as np
import datetime

def format_size_and_bedrooms(size_string):
    """
    Extract size and number of bedrooms from the raw
    housing text, using regular expressions.

    Parameters
    ----------
    size_string : str
        Text of a listing's housing element, e.g. "2br - 900ft2 -".
        Either value (or both) may be missing, and the values may be
        surrounded by arbitrary whitespace.

    Returns
    -------
    (float, float)
        ``(size, bedrooms)``; a value that is not present in the text
        is returned as NaN.
    """
    this_size = np.nan
    n_brs = np.nan

    if size_string:
        # Search for each quantity on its own so the result does not depend
        # on the order of the values or on the line breaks around them.
        br_match = re.search(r"(\d+(?:\.\d+)?)\s*br", size_string)
        ft_match = re.search(r"(\d[\d,]*(?:\.\d+)?)\s*ft\s*2", size_string)

        if br_match is not None:
            n_brs = br_match.group(1)
        if ft_match is not None:
            # tolerate thousands separators such as "1,200ft2"
            this_size = ft_match.group(1).replace(",", "")

    # return floats
    return float(this_size), float(n_brs)

def format_price(price_string):
    """
    Convert a price string such as '$1,375' to a float by
    removing the dollar sign and the thousands separators.
    """
    cleaned = price_string.replace("$", "").replace(",", "")
    return float(cleaned)

def format_time(date_string):
    """
    Convert a posting datetime string, e.g. '2022-01-04 00:53',
    to a ``datetime.datetime``.

    The first five runs of digits in the string are read as year, month,
    day, hour and minute, in that order; any further digits are ignored.

    Raises
    ------
    IndexError
        If the string contains fewer than five runs of digits.
    ValueError
        If the numbers do not form a valid date and time.
    """
    # Raw string: '\d' inside a plain string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    fields = re.findall(r'\d+', date_string)
    year, month, day, hour, minute = (int(fields[i]) for i in range(5))
    return datetime.datetime(year, month, day, hour, minute)
In [13]:
from time import sleep
import pandas as pd

# Scrape several pages of search results into one DataFrame.
max_pages = 5
results_per_page = 120
pause_seconds = 30  # delay between page requests
# Offset of the first listing on each page; sent as the 's' query parameter.
search_indices = np.arange(0, max_pages*results_per_page, results_per_page) 
url = 'http://philadelphia.craigslist.org/search/apa'
col_names = ['time', 'price', 'size', 'brs', 'title']

rows = []
for page_number, s in enumerate(search_indices):
    resp = requests.get(url, params={'bedrooms': 1, 's': s}, timeout=30)
    resp.raise_for_status()
    apts = BeautifulSoup(resp.content, 'html.parser').select("#search-results li")

    for apt in apts:
        # select_one()/find() return None when a listing lacks the element,
        # so check each tag before reading from it.
        housing_tag = apt.select_one("div > span.result-meta > span.housing")
        title_tag = apt.select_one("div > h3> a.result-title.hdrlnk")
        price_tag = apt.select_one("div > span.result-meta > span.result-price")
        time_tag = apt.find("time")

        # Without a title or a timestamp the <li> is not a usable listing.
        if title_tag is None or time_tag is None:
            continue

        # Missing housing details or price are recorded as NaN rather than
        # aborting the whole scrape.
        if housing_tag is None:
            sizes, brs = np.nan, np.nan
        else:
            sizes, brs = format_size_and_bedrooms(housing_tag.text)
        price = np.nan if price_tag is None else format_price(price_tag.text)
        dtime = format_time(time_tag['datetime'])

        rows.append([dtime, price, sizes, brs, title_tag.text])

    # Pause between requests; no need to wait after the final page.
    if page_number < len(search_indices) - 1:
        sleep(pause_seconds)

# Build the frame once from all collected rows (fresh 0..n-1 index).
results = pd.DataFrame(rows, columns=col_names)

results.head()
time price size brs title
0 2022-01-04 00:53:00 1375.0 900.0 2.0 New Year New Apartment! 2 bedroom bi-level apa...
1 2022-01-04 00:15:00 1925.0 1100.0 2.0 ONE-OF-A-KIND RESTORED VICTORIAN! PET FRIENDLY
2 2022-01-03 23:59:00 2870.0 2002.0 3.0 Gas fireplaces, Tennis, Hardwood laminate floors
3 2022-01-03 22:53:00 1475.0 1100.0 2.0 DELIGHTFUL, SPACIOUS VICTORIAN! Pets Welcome
4 2022-01-03 22:45:00 1400.0 1100.0 2.0 819 wharton st

3 Plotting the distribution of prices

In [14]:
import matplotlib.pyplot as plt
import statistics as st

# Histogram of listing prices; the dashed vertical line marks the mean price.
mean_price = results['price'].mean()

fig, ax = plt.subplots(figsize=(9, 6))
ax.hist(
    results['price'],
    bins=50,
    density=False,
    edgecolor="#e5989b",
    linewidth=2,
    color='white',
)
ax.axvline(x=mean_price, c='#CC3E43', lw=2, linestyle='dashed')

# Light dotted grid drawn underneath the bars.
ax.grid(color='#BABABA', lw=0.5, linestyle=':')
ax.set_axisbelow(True)

ax.set_xlabel("Price", fontsize=15)
ax.set_ylabel("Frequency", fontsize=15)
ax.set_title("Price Distribution", fontsize=20)

# Hide the frame around the plot.
for spine in ax.spines.values():
    spine.set_visible(False)
In [15]:
# Price per square foot (price / size).
# Missing prices and sizes are replaced by the mean of their column first;
# note that this overwrites the 'price' and 'size' columns of `results`.
for column in ('price', 'size'):
    results[column] = results[column].fillna(results[column].mean())
results['price/sf'] = results['price'] / results['size']

# Most frequent price-per-square-foot value, kept to two decimals.
mode = float("{:.2f}".format(st.mode(results['price/sf'])))

fig, ax = plt.subplots(figsize=(9, 6))
ax.hist(
    results['price/sf'],
    bins=50,
    density=False,
    edgecolor="#e5989b",
    linewidth=2,
    color='white',
)
ax.set_ylim(0, 90)

# Dashed line and label at the mode.
ax.axvline(x=mode, c='#CC3E43', lw=2, linestyle='dashed')
ax.text(mode + 0.2, 80, 'Mode = %s' % round(mode, 2), color='#CC3E43')

# Light dotted grid drawn underneath the bars.
ax.grid(color='#BABABA', lw=0.5, linestyle=':')
ax.set_axisbelow(True)

ax.set_xlabel("price/sf", fontsize=15)
ax.set_ylabel("Frequency", fontsize=15)
ax.set_title("Price per sq.ft Distribution", fontsize=20)

# Hide the frame around the plot.
for spine in ax.spines.values():
    spine.set_visible(False)

4 Comparing prices for different sizes

In [19]:
import altair as alt

# Let altair embed the full table even if it has many rows.
alt.data_transformers.disable_max_rows()

# Colour palette used to distinguish bedroom counts.
bedroom_palette = [
    "#ffd700",
    "#ffb14e",
    "#fa8775",
    "#ea5f94",
    "#cd34b5",
    "#9d02d7",
    "#e5989b",
]
colormap = alt.Scale(range=bedroom_palette)

# Encodings: price on x, size on y, one colour per bedroom count.
x_axis = alt.X('price:Q', title='Price (US Dollars)')
y_axis = alt.Y('size:Q', title='Size (Sq.Ft)')
bedroom_color = alt.Color(
    'brs:N',
    scale=colormap,
    legend=alt.Legend(title='Num of Bedrooms'),
)

scatter = (
    alt.Chart(results)
    .mark_circle(opacity=0.75, strokeWidth=20)
    .encode(
        x_axis,
        y_axis,
        bedroom_color,
        tooltip=['time', 'price', 'size', 'brs'],
    )
    .properties(width=400, height=200, title='Prices for different sizes')
)

# Font sizes for legend and axes, plus pan/zoom interaction.
chart = (
    scatter
    .configure_legend(titleFontSize=10, labelFontSize=8)
    .configure_axis(labelFontSize=8, titleFontSize=10)
    .interactive()
)

chart