## API

https://fred.stlouisfed.org/docs/api/fred/

In [2]:
import requests

import pandas as pd


FRED_URL = "https://api.stlouisfed.org/fred"


def fred_series_request(series):
    """Fetch the observations of a FRED series as parsed JSON.

    Parameters
    ----------
    series : str
        FRED series id, e.g. ``"UNRATE"``.

    Returns
    -------
    dict
        Decoded JSON body of the ``/series/observations`` endpoint.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    """
    import os

    # FIXME(security): the literal fallback below is a credential committed to
    # the notebook. It is kept only so existing runs keep working; set the
    # FRED_API_KEY environment variable, then rotate the key and drop the literal.
    key = os.environ.get("FRED_API_KEY", "d4f97d5305ace8b756e361252fbf02cd")

    # Let requests build the query string so that `series` is URL-encoded
    # instead of being spliced verbatim into the URL.
    response = requests.get(
        FRED_URL + "/series/observations",
        params={"series_id": series, "file_type": "json", "api_key": key},
        timeout=30,  # without a timeout requests can wait forever
    )
    # Fail here on an HTTP error rather than later on a missing JSON key.
    response.raise_for_status()

    return response.json()


In [5]:
# Request the FRED series with id "UNRATE" and keep the decoded JSON payload.
data = fred_series_request("UNRATE")
# Each entry under the "observations" key becomes one row of the frame.
df = pd.DataFrame(data["observations"])
# Persist the frame; to_csv also writes the index as the first column by default.
df.to_csv("UNRATE.csv")


In [6]:
df

Unnamed: 0,realtime_start,realtime_end,date,value
0,2021-03-30,2021-03-30,1948-01-01,3.4
1,2021-03-30,2021-03-30,1948-02-01,3.8
2,2021-03-30,2021-03-30,1948-03-01,4.0
3,2021-03-30,2021-03-30,1948-04-01,3.9
4,2021-03-30,2021-03-30,1948-05-01,3.5
...,...,...,...,...
873,2021-03-30,2021-03-30,2020-10-01,6.9
874,2021-03-30,2021-03-30,2020-11-01,6.7
875,2021-03-30,2021-03-30,2020-12-01,6.7
876,2021-03-30,2021-03-30,2021-01-01,6.3


## `pd.read_html`

https://www.espn.com/soccer/schedule/_/league/ger.1

In [10]:
import pandas as pd


def retrieve_soccer_schedule():
    """Read the HTML tables of the ESPN soccer schedule page (league ``ger.1``).

    Returns
    -------
    list
        Whatever ``pd.read_html`` yields for the page, i.e. a list of tables.
        The first table is the one we are after.
    """
    return pd.read_html("https://www.espn.com/soccer/schedule/_/league/ger.1")


In [14]:
# pd.read_html returns a list of tables found on the page, hence the indexing below.
tables = pd.read_html("https://www.espn.com/soccer/team/stats/_/id/125")

In [16]:
tables[1]

Unnamed: 0,RK,Name,P,A
0,1.0,Filip Kostic,22,10
1,2.0,Daichi Kamada,24,9
2,3.0,André Silva,24,3
3,,Aymen Barkok,21,3
4,5.0,Djibril Sow,21,2
5,,Amin Younes,19,2
6,,Steven Zuber,14,2
7,,Bas Dost,12,2
8,9.0,Martin Hinteregger,24,1
9,,Obite Ndicka,16,1


In [11]:
# NOTE: despite its name, `df` holds the *list* of tables that pd.read_html
# produced for the page, not a single DataFrame.
df = retrieve_soccer_schedule()

In [12]:
df

[                           match                Unnamed: 1  time  tv  \
 0          Bayer Leverkusen LEVv           Schalke 04 SCHA   NaN NaN   
 1         Borussia Dortmund DORv  Eintracht Frankfurt EINF   NaN NaN   
 2               FC Augsburg AUGv        TSG Hoffenheim HOF   NaN NaN   
 3                    Mainz MAINv     Arminia Bielefeld DSC   NaN NaN   
 4            VfL Wolfsburg WOLFv            FC Cologne COL   NaN NaN   
 5                RB Leipzig LEIv         Bayern Munich MUN   NaN NaN   
 6  Borussia Monchengladbach MONv          SC Freiburg FREI   NaN NaN   
 
    Unnamed: 4  
 0         NaN  
 1         NaN  
 2         NaN  
 3         NaN  
 4         NaN  
 5         NaN  
 6         NaN  ,
                    match          Unnamed: 1  time  tv  Unnamed: 4
 0     VfB Stuttgart STUv   Werder Bremen WER   NaN NaN         NaN
 1  FC Union Berlin FCUBv  Hertha Berlin HERT   NaN NaN         NaN,
                     match        Unnamed: 1  time  tv  Unnamed: 4
 0  A

In [None]:
# `df` is the list of tables returned by pd.read_html and a list has no
# .to_csv(); write out the first table, which is the one we want.
df[0].to_csv("soccer_schedule.csv")

## Request intercept

https://covid.cdc.gov/covid-data-tracker/#vaccinations

In [17]:
import requests

import pandas as pd


def retrieve_vaccination_data():
    """Fetch the CDC COVID data tracker vaccination payload as parsed JSON.

    Returns
    -------
    dict
        Decoded JSON body of the ``getAjaxData?id=vaccination_data`` endpoint.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with a 4xx/5xx status.
    """
    url = "https://covid.cdc.gov/covid-data-tracker/COVIDData/getAjaxData?id=vaccination_data"

    # Without a timeout requests can block indefinitely on a stalled server.
    response = requests.get(url, timeout=30)
    # Surface HTTP errors directly instead of as a JSON decoding failure.
    response.raise_for_status()

    return response.json()


def parse_vaccine_json_to_df(data):
    """Turn the ``"vaccination_data"`` entry of ``data`` into a DataFrame.

    Parameters
    ----------
    data : mapping
        Payload that holds the records under the key ``"vaccination_data"``.

    Returns
    -------
    pandas.DataFrame
        ``pd.DataFrame`` built directly from those records.
    """
    records = data["vaccination_data"]
    return pd.DataFrame(records)


In [18]:
data = retrieve_vaccination_data()

In [27]:
data["vaccination_data"]

[{'Date': '2021-03-29',
  'Location': 'US',
  'ShortName': 'USA',
  'LongName': 'United States',
  'Census2019': 331996199,
  'date_type': 'Report',
  'Doses_Distributed': 180646565,
  'Doses_Administered': 145812835,
  'Dist_Per_100K': 54412,
  'Admin_Per_100K': 43920,
  'Administered_Moderna': 68177850,
  'Administered_Pfizer': 74338025,
  'Administered_Janssen': 3155418,
  'Administered_Unk_Manuf': 141542,
  'Administered_Dose1_Recip': 95015762,
  'Administered_Dose1_Pop_Pct': 28.6,
  'Administered_Dose2_Pop_Pct': 14.9,
  'Administered_Dose1_Recip_18Plus': 94686188,
  'Administered_Dose1_Recip_18PlusPop_Pct': 36.7,
  'Administered_18Plus': 145400967,
  'Admin_Per_100k_18Plus': 56328,
  'Distributed_Per_100k_18Plus': 69983,
  'Administered_Dose1_Recip_65Plus': 39799162,
  'Administered_Dose1_Recip_65PlusPop_Pct': 72.8,
  'Administered_65Plus': 66457122,
  'Admin_Per_100k_65Plus': 121501,
  'Distributed_Per_100k_65Plus': 330268,
  'Administered_Dose2_Recip': 49480294,
  'Administered_

In [28]:
# Fetch the raw JSON payload, convert it to a DataFrame and persist it as CSV.
data = retrieve_vaccination_data()
df = parse_vaccine_json_to_df(data)
df.to_csv("vaccination_data.csv")


## Parsing page

https://magicseaweed.com/Baltrum-Surf-Report/1117/

In [44]:
import requests

from bs4 import BeautifulSoup


def retrieve_webpage_soup(url="https://magicseaweed.com/St-Peter-Ording-Surf-Report/157/"):
    """Download a magicseaweed surf-report page and parse it with BeautifulSoup.

    Parameters
    ----------
    url : str, optional
        Page to fetch. Defaults to the St-Peter-Ording report, the page this
        function has always fetched. Pass another report URL, e.g.
        ``"https://magicseaweed.com/Baltrum-Surf-Report/1117/"``, instead of
        editing the function body.

    Returns
    -------
    bs4.BeautifulSoup
        Parsed document.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    # Without a timeout requests can block indefinitely on a stalled server.
    response = requests.get(url, timeout=30)
    # Do not parse an error page as if it were the report.
    response.raise_for_status()

    # Name the parser explicitly: otherwise bs4 picks whichever parser happens
    # to be installed (and warns), so the parse tree can differ between machines.
    soup = BeautifulSoup(response.text, "html.parser")

    return soup

In [31]:
soup = retrieve_webpage_soup()

In [33]:
soup.find_all("h1")

[<h1 class="nomargin page-title">  Baltrum Surf Report and Forecast  </h1>]

In [38]:
# Label of the first <li class="list-group-item">: the text of its
# <div class="list-group-title"> child, with surrounding whitespace removed.
soup.find_all(
    "li", attrs={"class": "list-group-item"}
)[0].find(
    "div", attrs={"class": "list-group-title"}
).text.strip()

'Primary Swell'

In [40]:
# Value of the first <li class="list-group-item">: the text of its
# <div class="list-group-content"> child, with surrounding whitespace removed.
soup.find_all(
    "li", attrs={"class": "list-group-item"}
)[0].find(
    "div", attrs={"class": "list-group-content"}
).text.strip()

'3ft at 6s'

In [45]:
def extract_swell(soup):
    """Collect the title/content pairs of the first two list-group items.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed page that contains ``<li class="list-group-item">`` elements.

    Returns
    -------
    pandas.DataFrame
        One row per item, with the stripped text of its ``list-group-title``
        div in ``"variable"`` and of its ``list-group-content`` div in
        ``"value"``.
    """

    def _div_text(node, css_class):
        # Stripped text of the first <div> with the given class inside `node`.
        return node.find("div", attrs={"class": css_class}).text.strip()

    # Only the first two entries are kept.
    leading_items = soup.find_all("li", attrs={"class": "list-group-item"})[:2]

    records = [
        {
            "variable": _div_text(entry, "list-group-title"),
            "value": _div_text(entry, "list-group-content"),
        }
        for entry in leading_items
    ]

    return pd.DataFrame(records)

In [46]:
# Re-fetch the page, pull out the swell rows and save them as CSV.
soup = retrieve_webpage_soup()
df = extract_swell(soup)

df.to_csv("swell.csv")


In [47]:
df

Unnamed: 0,variable,value
0,Primary Swell,3.5ft at 6s
1,Secondary Swell,0.4ft at 14s


## Downloading files

In [1]:
import requests

In [2]:
# Download the sample PDF. The timeout keeps the cell from hanging on an
# unresponsive server, and raise_for_status() stops on an HTTP error so that
# an error page is never written to disk as if it were the PDF.
res = requests.get(
    "http://www.africau.edu/images/default/sample.pdf",
    timeout=30,
)
res.raise_for_status()

In [52]:
# Binary mode ("wb") because res.content is the raw response body as bytes.
with open("test.pdf", "wb") as f:
    f.write(res.content)


## Reading content from a PDF

In [54]:
import camelot

In [None]:
# Not a good fit here: camelot extracts tables from a PDF rather than its
# plain text.
camelot.read_pdf("test.pdf")  

https://pypi.org/project/PyMuPDF/

In [1]:
import fitz
# fitz.read_pdf

In [3]:
doc = fitz.open("test.pdf")

In [5]:
# Plain text of the first page (page index 0).
doc.get_page_text(0)

' A Simple PDF File \n This is a small demonstration .pdf file - \n just for use in the Virtual Mechanics tutorials. More text. And more \n text. And more text. And more text. And more text. \n And more text. And more text. And more text. And more text. And more \n text. And more text. Boring, zzzzz. And more text. And more text. And \n more text. And more text. And more text. And more text. And more text. \n And more text. And more text. \n And more text. And more text. And more text. And more text. And more \n text. And more text. And more text. Even more. Continued on page 2 ...\n'