Skip to content Skip to sidebar Skip to footer

Beautifulsoup: When Row Not Present, NaN Else Value

This code gets data from www.oddsportal.com. How can I accommodate the case where no score is present for an event? Currently, the code scrapes all data from the pages:

Solution 1:

You need to first:

from numpy import nan

And then modify code as follows:

        ...
        # Score present? A cell without ':' is treated as not being a score.
        if ':' not in row[2]:
            # No: row[2..4] are moved one column to the right (into row[3..5])
            # and the score slot is filled with NaN.
            row[5], row[4], row[3], row[2] = row[4], row[3], row[2], nan
        game_data.score.append(row[2])
        # An odds cell holding just '-' is stored as NaN.
        game_data.home_odds.append(nan if row[3] == '-' else row[3])
        game_data.draw_odds.append(nan if row[4] == '-' else row[4])
        game_data.away_odds.append(nan if row[5] == '-' else row[5])
        ...

Note that generate_matches has to be modified to yield list instances rather than tuple instances, since the code above modifies each yielded row in place and therefore needs it to be mutable.

Putting it all together:

import pandas as pd
from numpy import nan
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import threading
from multiprocessing.pool import ThreadPool, Pool
from functools import partial
import os
import re

class Driver:
    """Own one headless Chrome WebDriver and quit it when this wrapper is destroyed."""

    def __init__(self):
        """Start a headless Chrome session and keep it in ``self.driver``."""
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # Excluding the 'enable-logging' switch suppresses logging;
        # comment the next line out to get the log output back.
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        """Quit the browser, if one was started and has not been quit yet."""
        # `driver` does not exist when __init__ raised before assigning it,
        # and is None once we have already quit, so look it up defensively
        # instead of raising AttributeError from a finalizer.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            self.driver = None  # a repeated call becomes a no-op
            driver.quit()  # clean up driver when we are cleaned up


# Thread-local storage: each thread sees its own `the_driver` attribute.
threadLocal = threading.local()


def create_driver():
    """Return the calling thread's WebDriver, building its Driver wrapper on first use."""
    if getattr(threadLocal, 'the_driver', None) is None:
        # Nothing cached for this thread yet: create the wrapper and remember it.
        threadLocal.the_driver = Driver()
    return threadLocal.the_driver.driver


class GameData:
    """Column-wise accumulator for scraped matches: one list per field."""

    def __init__(self):
        """Give every field its own empty list."""
        # The assignment order fixes the key order of the instance __dict__.
        for field_name in ('date', 'time', 'game', 'score', 'home_odds',
                           'draw_odds', 'away_odds', 'country', 'league'):
            setattr(self, field_name, [])


def generate_matches(table):
    """Yield one 8-item list per match row of *table*.

    A row whose class list contains 'dark' is a section header: the first two
    links inside its ``th`` cell with class ``first2 tl`` give the country and
    league used for the match rows that follow it. Any other row that has at
    least six ``td`` cells is a match row; rows with fewer cells are skipped.

    Args:
        table: element exposing ``find_all``/``find``/``attrs``/``text`` in
            the BeautifulSoup style.

    Yields:
        ``[cell0, cell1, cell2, cell3, cell4, cell5, country, league]`` where
        the first six entries are the text of the row's first six cells. It is
        a list (not a tuple) so the caller can modify it. ``country`` and
        ``league`` are ``None`` until a header row supplying them is seen.
    """
    # Defined up front so a match row that precedes any header row cannot
    # hit an unbound local.
    country = league = None
    for tr_tag in table.find_all('tr'):
        if 'dark' in tr_tag.attrs.get('class', ()):
            th_tag = tr_tag.find('th', {'class': 'first2 tl'})
            a_tags = th_tag.find_all('a') if th_tag is not None else []
            # Reset rather than keep the previous section's values when a
            # header is missing one of its links.
            country = a_tags[0].text if len(a_tags) > 0 else None
            league = a_tags[1].text if len(a_tags) > 1 else None
            continue
        td_tags = tr_tag.find_all('td')
        if len(td_tags) < 6:
            # Not a match row (too few cells to index safely).
            continue
        yield [td_tag.text for td_tag in td_tags[:6]] + [country, league]


def parse_data(process_pool, url, return_urls=False):
    """Load *url* in this thread's browser and parse the rendered page.

    Args:
        process_pool: pool whose ``apply`` is used to run ``process_page``
            on the page source.
        url: address of the page to load.
        return_urls: passed through unchanged to ``process_page``.

    Returns:
        Whatever ``process_page`` returns for the loaded page.
    """
    # Imported here so this function brings its own locator constants;
    # `find_element(By.XPATH, ...)` replaces `find_element_by_xpath`, which
    # newer Selenium releases no longer provide.
    from selenium.webdriver.common.by import By

    browser = create_driver()
    browser.get(url)
    # Wait for initial content to be dynamically updated with scores:
    # with a 5 second implicit wait, the lookup below keeps retrying until
    # the matches table exists. Only that waiting matters; the element
    # itself is not used.
    browser.implicitly_wait(5)
    browser.find_element(By.XPATH, '//*[@id="table-matches"]/table')
    # If you do not pass a Pool instance to this function to use
    # multiprocessing for the more CPU-intensive work,
    # then just replace next statement with: return process_page(browser.page_source, return_urls)
    return process_pool.apply(process_page, args=(browser.page_source, return_urls))

def process_page(page_source, return_urls):
    """Parse one rendered matches page into a ``GameData``.

    The page's ``h1`` text is printed and must end in a date such as
    "10 Sep 2021"; that date is recorded for every match on the page.

    Args:
        page_source: HTML of the page.
        return_urls: when true, also return the absolute URLs built from the
            links inside the ``span`` with class ``next-games-date``.

    Returns:
        ``game_data`` alone, or ``(game_data, urls)`` when *return_urls* is true.

    Raises:
        ValueError: if the heading does not end in a "<day> <month> <year>" date.
    """
    soup = bs(page_source, "lxml")
    div = soup.find('div', {'id': 'table-matches'})
    table = div.find('table', {'class': 'table-main'})
    h1 = soup.find('h1').text
    print(h1)
    m = re.search(r'\d+ \w+ \d{4}$', h1)
    if m is None:
        # Fail with a message naming the heading instead of an opaque
        # "'NoneType' object is not subscriptable".
        raise ValueError(f'no date found at the end of page heading: {h1!r}')
    game_date = m[0]
    game_data = GameData()
    for kickoff, game, score, home, draw, away, country, league in generate_matches(table):
        # Score present?
        if ':' not in score:
            # No: what was read as score/home/draw is really home/draw/away,
            # so shift those values one column right and record NaN as score.
            score, home, draw, away = nan, score, home, draw
        game_data.date.append(game_date)
        game_data.time.append(kickoff)
        game_data.game.append(game)
        game_data.score.append(score)
        # An odds cell holding just '-' is stored as NaN.
        game_data.home_odds.append(nan if home == '-' else home)
        game_data.draw_odds.append(nan if draw == '-' else draw)
        game_data.away_odds.append(nan if away == '-' else away)
        game_data.country.append(country)
        game_data.league.append(league)

    if return_urls:
        span = soup.find('span', {'class': 'next-games-date'})
        a_tags = span.find_all('a')
        urls = ['https://www.oddsportal.com' + a_tag['href'] for a_tag in a_tags]
        return game_data, urls
    return game_data


if __name__ == '__main__':
    results = None

    pool = ThreadPool(3) # This seems to be optimal for this application
    # Create multiprocessing pool to do the CPU-intensive processing.
    # os.cpu_count() may return None, so fall back to a single process:
    process_pool = Pool(min(5, os.cpu_count() or 1)) # 5 seems to be optimal for this application
    try:
        # Get today's data and the Urls for the other days:
        game_data_today, urls = pool.apply(parse_data, args=(process_pool, 'https://www.oddsportal.com/matches/soccer', True))
        urls.pop(1)  # Remove url for today: We already have the data for that
        # imap yields results in the same order as urls; collect however many
        # days the page linked to instead of assuming a fixed count.
        all_game_data = list(pool.imap(partial(parse_data, process_pool), urls))
        # Today's url was at position 1, so its data goes back in at position 1.
        all_game_data.insert(1, game_data_today)
        # Build every frame first and concatenate once: DataFrame.append is
        # gone from current pandas and copied the accumulated frame each time.
        frames = [pd.DataFrame(game_data.__dict__) for game_data in all_game_data]
        results = pd.concat(frames, ignore_index=True)

        print(results)
    finally:
        # Shut both pools down and wait for their workers to finish.
        pool.close()
        pool.join()
        process_pool.close()
        process_pool.join()
        # ensure all the drivers are "quitted":
        del threadLocal

Prints:

Next Soccer Matches: Today, 10 Sep 2021
Next Soccer Matches: Tuesday, 14 Sep 2021
Next Soccer Matches: Wednesday, 15 Sep 2021
Next Soccer Matches: Thursday, 16 Sep 2021
Next Soccer Matches: Yesterday, 09 Sep 2021
Next Soccer Matches: Sunday, 12 Sep 2021
Next Soccer Matches: Monday, 13 Sep 2021
Next Soccer Matches: Tomorrow, 11 Sep 2021
             date   time                              game     score home_odds draw_odds away_odds     country                league
0     09 Sep 2021  00:00            Cumbaya - Guayaquil SC       1:0      -169      +263      +462     Ecuador               Serie B
1     09 Sep 2021  00:00            FC Tulsa - Indy Eleven       2:1      -104      +265      +237         USA      USL Championship
2     09 Sep 2021  00:05           Pumas Tabasco - Atlante       0:2      +221      +186      +134      Mexico  Liga de Expansion MX
3     09 Sep 2021  00:05                   Panama - Mexico       1:1      +518      +250      -156       World        World Cup 2022
4     09 Sep 2021  00:10        Defensa y Justicia - Tigre  0:1 pen.      +138      +199      +214   Argentina        Copa Argentina
...           ...    ...                               ...       ...       ...       ...       ...         ...                   ...
1987  16 Sep 2021  19:00      Olympiacos Piraeus - Antwerp       NaN      -137      +296      +371      Europe         Europa League
1988  16 Sep 2021  19:15               Academica - Estrela       NaN      -106      +231      +290    Portugal       Liga Portugal 2
1989  16 Sep 2021  21:00               Barnechea - Rangers       NaN      +202      +202      +127       Chile             Primera B
1990  16 Sep 2021  22:00  San Marcos de Arica - S. Morning       NaN      +212      +214      +122       Chile             Primera B
1991  16 Sep 2021  23:30       U. De Concepcion - Coquimbo       NaN      +158      +198      +162       Chile             Primera B

[1992 rows x 9 columns]

Post a Comment for "Beautifulsoup: When Row Not Present, NaN Else Value"