6 Python

6.1 Basic

6.1.1 Regular Expressions

Regular expressions
^	Matches the beginning of a line
$	Matches the end of a line
.	Matches any character
backslash s	Matches whitespace
backslash S	Matches any non-whitespace character
*	Repeats a character zero or more times
*?	Repeats a character zero or more times (non-greedy)
+	Repeats a character one or more times
+?	Repeats a character one or more times (non-greedy)
[aeiou]	Matches a single character in the listed set
[^YXZ]	Matches a single character not in the listed set
[a-z0-9]	The set of characters can include a range
(	Indicates where string extraction is to start
)	Indicates where string extraction is to end
/th>	Matches word boundary
/th>	any digit
	any non-digit
	any whitespace
	any non-whitespace
	any alphanumerical character
	any non-alphanumerical character
{n}	Exactly n repetitions
{n,}	At least n repetitions
{,n}	At most n repetitions
{m,n}	At least m and at most n repetitions

6.1.1.1 Regex working with dates

###Regular expression for dates
#date variations are many
dateStr  = '23-10-2002\n23/10/2002\n23/10/02\n10/23/2002\nOct 23, 2002\nOctober 23, 2002\n23 October 2002'

print(re.findall(r'\d{1,2}[/-]\d{1,2}[/-]\d{4}', dateStr)) ###matches any of first 1-2 digits, then either -/ then 1-2 digits, then either, then two digits
print(re.findall(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}', dateStr))  ##at least 2 and max 4

print(re.findall(r'\d{1,2} (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{2,4}', dateStr)) ###Using ? says that it should not just give the information in the ()
#starts with the characters but can have more characters after1
print(re.findall(r'(?:\d{1,2} )?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug| Sep|Oct|Nov|Dec)[a-z]* (?:\d{1,2}, )?\d{2,4}', dateStr)) #the above says it may have information at the start or at the expanded

6.1.2 Create temp table Pandas

import pandas as pd

data = {'month': ['January','February','March','April','May','June','July','August','September','October','November','December'],
        'days_in_month': [31,28,31,30,31,30,31,31,30,31,30,31]
        }

df = pd.DataFrame(data, columns = ['month', 'days_in_month'])
print (df)

6.1.3 Working with Text and Strings

#Working with Text

text1 = "Since February 2015, the Riksbank has purchased government bonds with the aim of making monetary policy more expansionary."

len(text1) #get number of characters

text2 = text1.split(' ') #split into words
len(text2) #get number of words
print(text2)

#finding specific words_fomc
print([w for w in text2 if len(w) > 3]) #get all words longer then 3 characters

#capitalized words
print([w for w in text2 if w.istitle()]) #use istitle to get words with capitalized first character

#words that end with s
print([w for w in text2 if w.endswith('s')]) #words that end with s


#Finding unique words
text3 = 'To be or not to be'
text4 = text3.split(' ')
print(len(text4))

print(set(text4)) ##finding unique words with set (but caps sensitive)

print(len(set([w.lower() for w in text4]))) ##only get lower
print(set([w.lower() for w in text4]))
#minut 4.

##check what it is
#s.startwidth(t)
#s.endswith(t)
#t in s #if a substring is in a bigger
#s.isupper(); s.islower(); s.istitle()
#s.isalpha(); s.isdigit(); s.isalnum() #if string is alphabetical, numerical, or both

##string operations (change the string)
#s.lower(); s.upper(); s.titlecase()
#s.split(t) #split depending on t
#s.splitlines() #split sentence line on new line character or end of line character
#s.join(t) #takes word t and join with s
#s.strip(); s.rstrip() ##strip, take out for example space. strip from front, rstrip from endswith
#s.find(t); s.rfind(t)
#s.replace(u,v)

###from words to characters
text5 = 'ouagadougou'
text6 = text5.split('ou') #split on ou
print(text6)
text6 = 'ou'.join(text6) #now you get back the full. split and join opposite.
print(text6)

#get all of the characters, do it as a list
print(list(text5))
print([c for c in text5])

###Cleaning text
text8 = '    A quick brown fox jumped over the lazy dog. '

text9 = text8.strip() #strips out white characters
print(text9.split(' ')) #split on space

###Changing text1
#find and replace
text9 = '    A quick brown fox jumped over the lazy dog. '
print(text9.find('o')) #starts from start

print(text9.rfind('o')) #starts from end

print(text9.replace('o', 'O')) #replace small letters with capital

6.1.4 Case_when equivalent in Python

# Case_when equivalent with contains-statement


import pandas as pd
import numpy as np

dict = {'instrument_name':["volvo to2", "volvo", "skandia", "skandia teckningsratt hej", "seb bta"],
        'direction': ["buy", "sell", "buy", "sell", "buy"]}

# creating a dataframe from a dictionary
df = pd.DataFrame(dict, columns = ['instrument_name', 'direction'])

conditions = [
    (df['instrument_name'].str.contains("to2")),
    (df['instrument_name'].str.contains("bta")),
    (df['instrument_name'].str.contains("teckningsratt")),
    (df['instrument_name'].str.contains("teckningsratter")),
    (df['instrument_name'].str.contains("teckningsoption")),

]
choices = ["subscription_option","subscription_option","subscription_option","subscription_option", "subscription_option"]
df["type"] = np.select(conditions, choices)
print(df)

6.2 Scrapy

pip install Scrapy

6.2.1 Shell commands

Run scrapy in shell: Write ‘scrapy shell’ in miniconda

Fetch page: Write ‘Fetch (“https://www.riksbank.se/sv/penningpolitik/penningpolitiska-instrument/kop-av-foretagscertifikat/auction-results/2020/results-of-auctions-2020-09-23/”)’

Use view(response)

6.2.2 Scrape table

# Spider

import scrapy


class ToScrapeCSSSpider(scrapy.Spider):
    name = "rb_scrape2"
    start_urls = [

        'https://www.riksbank.se/sv/penningpolitik/penningpolitiska-instrument/kop-av-foretagscertifikat/auction-results/2020/results-of-auctions-2020-09-23/'
    ]

    def parse(self, response):


        for item in response.xpath('//*[@class="page-base__main__body"]//tr'):
            yield {

            'terms' : item.xpath('td[1]//text()').extract(),
            'auction' : item.xpath('td[2]//text()').extract()

            }

        next_page_url = response.css("li.next > a::attr(href)").extract_first()
        if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url))


# Settings

#Export as CSV Feed
#FEED_FORMAT = "csv"
#FEED_URI = "rbbot.csv"

#Run crawler in miniconda with creating an csv (if its not in settings): scrapy crawl rb_scrape2 -o quotes.csv

6.2.3 Scrape table json treasury scraper

# Spider
# -*- coding: utf-8 -*-
#from scrapy import BaseSpider
import scrapy
#from TreasuryScraper.items import TreasuryItem
import json
class TreasurySpider(scrapy.Spider):
    name = 'treasury'
    start_urls = [
        'https://www.treasurydirect.gov/TA_WS/securities/jqsearch?format=json&filterscount=0&groupscount=0&pagenum=0&pagesize=1000&recordstartindex=0&recordendindex=1000',
    ]
    def parse(self, response):
        jsonresponse = json.loads(response.text)
        for item in jsonresponse['securityList']:
            yield {
                'cusip': item['cusip'],
                'securityType': item['securityType'],
                'securityTerm': item['securityTerm'],
                'offeringAmount': item['offeringAmount'],
                'tips': item['tips'],
                'type': item['type'],
                'pricePer100': item['pricePer100'],
                'floatingRate': item['floatingRate'],
                'reopening': item['reopening'],
                'auctionDate': item['auctionDate'],
                'maturityDate': item['maturityDate'],
                'term': item['term'],
                'competitiveAccepted': item['competitiveAccepted'],
                'allocationPercentage': item['allocationPercentage'],
                'averageMedianYield': item['averageMedianYield'],
                'bidToCoverRatio': item['bidToCoverRatio'],
                'competitiveAccepted': item['competitiveAccepted'],
                'highYield': item['highYield'],
                'lowYield': item['lowYield'],
                'somaAccepted': item['somaAccepted'],
                'somaHoldings': item['somaHoldings'],
                'primaryDealerAccepted': item['primaryDealerAccepted'],
                'directBidderAccepted': item['directBidderAccepted'],
                'directBidderTendered': item['directBidderTendered'],
                'indirectBidderAccepted': item['indirectBidderAccepted'],
                'indirectBidderTendered': item['indirectBidderTendered'],
                'interestPaymentFrequency': item['interestPaymentFrequency']

                }

#settings
# -*- coding: utf-8 -*-
#BOT_NAME = 'TreasuryScraper'
#SPIDER_MODULES = ['TreasuryScraper.spiders']
#NEWSPIDER_MODULE = 'TreasuryScraper.spiders'
#ROBOTSTXT_OBEY = False
#DOWNLOAD_DELAY = 60.0
#AUTOTHROTTLE_ENABLED = True
#HTTPCACHE_ENABLED = True 
#FEED_EXPORT_ENCODING = 'utf-8'

#scrapy crawl treasury -o output.csv

6.2.4 Scrape table json auction results Riksbank


import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from RiksbankAuctionScraper.items import GovernmentBond

def get_table_attr(response, x): # to handle strange tables
  # using normalize-space() to avoid "\n" as matching xpath
  xpaths = [
    './/td[contains(text(),"{0}")]/following-sibling::td/text()[normalize-space()]', #if td contains, use following sibling
    './/td[contains(text(),"{0}")]/following-sibling::td/p/text()[normalize-space()]',
    './/th/span[contains(text(),"{0}")]/../following-sibling::td/span/text()[normalize-space()]' #if th/span contains text, use following sibbling
  ]
  xpath_str = '|'.join(xpaths).format(x)
  return response.xpath(xpath_str).get()

class RiksbankSpider(CrawlSpider):
    name = "RiksbankAuctionScraper_v5"
    allowed_domains = ['riksbank.se']
    start_urls = [
      'https://www.riksbank.se/sv/penningpolitik/penningpolitiska-instrument/kop-av-statsobligationer/results-of-auctions'
    ]
    rules = (
      Rule(LinkExtractor(
        allow=('\/kop-av-statsobligationer\/results-of-auctions\/2020\/results-of-auctions-\d{4}-\d{2}-\d{2}\/$')
      ), callback='parse_government_bond'),
    )
    
    def parse_government_bond(self, response):
        for selector in response.xpath("//table"):
          item = GovernmentBond()
          item['auction_type'] = 'statsobligationer' 
          item['auction_date'] = get_table_attr(selector, "Auction date") #text to look for
          item['loan_number'] = get_table_attr(selector, "Loan")
          yield item
          
 #Items    
# -*- coding: utf-8 -*-
from scrapy import Item, Field

class GovernmentBonds(Item):
  auction_type = Field()
  auction_date = Field()
  loan_number= Field()

6.2.5 Scrape table RiksbankAuctionScraper

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from RiksbankAuctionScraper.items import GovernmentBonds, SekLending, FundingForLending, CommercialPapers, UsLending, CoveredBonds, MunicipalBonds

def get_table_attr(response, x):
  # using normalize-space() to avoid "\n" as matching xpath
  xpaths = [
    './/td[contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"),"{0}")]/following-sibling::td/text()[normalize-space()]',
    './/td[contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"),"{0}")]/following-sibling::td/*/text()',
    './/td/span[contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"),"{0}")]/../following-sibling::td/span/text()',
    './/th/span[contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"),"{0}")]/../following-sibling::td/span/text()'
  ]
  xpath_str = '|'.join(xpaths).format(x)
  return response.xpath(xpath_str).get()

class RiksbankSpider(CrawlSpider):
    name = "RiksbankAuctionScraper"
    allowed_domains = ['riksbank.se']
    start_urls = [
     # 'https://www.riksbank.se/sv/penningpolitik/penningpolitiska-instrument/kop-av-kommunobligationer/auction-results/',
     # 'https://www.riksbank.se/sv/penningpolitik/penningpolitiska-instrument/kop-av-statsobligationer/results-of-auctions/',
      'https://www.riksbank.se/sv/penningpolitik/penningpolitiska-instrument/kop-av-sakerstallda-obligationer/results-of-auctions/',
      #'https://www.riksbank.se/sv/penningpolitik/penningpolitiska-instrument/kop-av-foretagscertifikat/auction-results/',
      #'https://www.riksbank.se/sv/penningpolitik/penningpolitiska-instrument/lan-till-bankerna-for-vidareutlaning-till-foretag/auction-results/',
     # 'https://www.riksbank.se/sv/penningpolitik/penningpolitiska-instrument/lan-i-amerikanska-dollar/auction-results/',
    #  'https://www.riksbank.se/sv/penningpolitik/penningpolitiska-instrument/veckovisa-extraordinara-marknadsoperationer/auction-results/'
    ]
    rules = (
      Rule(LinkExtractor(
        allow=('\/kop-av-statsobligationer\/results-of-auctions\/')
      ), callback='parse_bonds'),
      Rule(LinkExtractor(
        allow=('\/kop-av-sakerstallda-obligationer\/results-of-auctions\/2020\/')
      ), callback='parse_covered_bonds'),
      Rule(LinkExtractor(
        allow=('\/veckovisa-extraordinara-marknadsoperationer\/auction-results\/2020\/')
      ), callback='parse_sek_lending'),
      Rule(LinkExtractor(
        allow=('\/kop-av-foretagscertifikat\/auction-results\/2020\/')
      ), callback='parse_ftg_cert'),
      Rule(LinkExtractor(
        allow=('\/lan-till-bankerna-for-vidareutlaning-till-foretag\/auction-results\/2020\/')
      ), callback='parse_ffl'),
      Rule(LinkExtractor(
        allow=('\/lan-i-amerikanska-dollar\/auction-results\/2020\/')
      ), callback='parse_us_lending'),
      Rule(LinkExtractor(
        allow=('\/kop-av-kommunobligationer\/auction-results\/2020\/')
      ), callback='parse_municipal'),
    )

    def parse_bonds(self, response):
        for selector in response.xpath("//table"):
          item = GovernmentBonds()
          item['auction_type'] = 'governmentBonds'
          item['auction_date'] = get_table_attr(selector, "auction date")
          item['loan_number'] = get_table_attr(selector, "loan")
          item['isin'] = get_table_attr(selector, "isin")
          item['coupon'] = get_table_attr(selector, "coupon")
          item['tendered_volume'] = get_table_attr(selector, "tendered")
          item['volume_offered'] = get_table_attr(selector, "offered")
          item['volume_bought'] = get_table_attr(selector, "bought")
          item['number_of_bids'] = get_table_attr(selector, "number of bids")
          item['number_of_accepted_bids'] = get_table_attr(selector, "number of accepted bids")
          item['average_yield'] = get_table_attr(selector, "average yield")
          item['lowest_accepted_yield'] = get_table_attr(selector, "lowest accepted yield")
          item['highest_accepted_yield'] = get_table_attr(selector, "highest yield")
          item['accepted_at_lowest_perc'] = get_table_attr(selector, "accepted at lowest")
          yield item

    def parse_covered_bonds(self, response):
        for selector in response.xpath("//table"):
          item = CoveredBonds()
          item['auction_type'] = 'coveredBonds'
          item['auction_date'] = get_table_attr(selector, "auction date")
          item['loan_number'] = get_table_attr(selector, "loan")
          item['isin'] = get_table_attr(selector, "isin")
          item['coupon'] = get_table_attr(selector, "coupon")
          item['tendered_volume'] = get_table_attr(selector, "tendered")
          item['volume_offered'] = get_table_attr(selector, "offered")
          item['volume_bought'] = get_table_attr(selector, "bought")
          item['number_of_bids'] = get_table_attr(response, "number of bids")
          item['number_of_accepted_bids'] = get_table_attr(selector, "number of accepted bids")
          item['average_yield'] = get_table_attr(selector, "average yield")
          item['lowest_accepted_yield'] = get_table_attr(selector, "lowest accepted yield")
          item['highest_accepted_yield'] = get_table_attr(selector, "highest yield")
          item['accepted_at_lowest_perc'] = get_table_attr(selector, "accepted at lowest")
          yield item


    def parse_sek_lending(self, response):
        for selector in response.xpath("//table"):
          item = SekLending()
          item['auction_type'] = 'sekLending'
          item['auction_date'] = get_table_attr(selector, "auction date")
          item['payment_date'] = get_table_attr(selector, "payment date")
          item['maturity_date'] = get_table_attr(selector, "maturity date")
          item['term'] = get_table_attr(selector, "term")
          item['offered_volume'] = get_table_attr(selector, "offered volume")
          item['total_bid_amount'] = get_table_attr(selector, "total bid amount")
          item['number_of_bids'] = get_table_attr(selector, "number of bids")
          item['allotment'] = get_table_attr(selector, "allotment")
          item['interest_rate'] = get_table_attr(selector, "interest rate")
          yield item

    def parse_ftg_cert(self, response):
        for selector in response.xpath("//table"):
          item = CommercialPapers()
          item['auction_type'] = 'commercialPapers'
          item['auction_date'] = get_table_attr(selector, "auction date")
          item['credit_rating_class'] = get_table_attr(selector, "credit rating class")
          item['term'] = get_table_attr(selector, "term")
          item['fixed_purchase_rate'] = get_table_attr(selector, "fixed purchase rate")
          item['total_bid_amount'] = get_table_attr(selector, "total bid amount")
          item['volume_bought'] = get_table_attr(selector, "accepted volume")
          item['percentage_alloted'] = get_table_attr(selector, "percentage allotted")
          item['number_of_bids'] = get_table_attr(selector, "number of bids")
          yield item

    def parse_ffl(self, response):
        for selector in response.xpath("//table"):
          item = FundingForLending()
          item['auction_type'] = 'fundingForLending'
          item['auction_date'] = get_table_attr(selector, "auction date")
          item['settlement_date'] = get_table_attr(selector, "settlement date")
          item['final_repayment_date'] = get_table_attr(selector, "final repayment date")
          item['offered_volume'] = get_table_attr(selector, "offered volume")
          item['total_bid_amount'] = get_table_attr(selector, "total bid amount")
          item['number_of_bids'] = get_table_attr(selector, "number of bids")
          item['allotment'] = get_table_attr(selector, "allotment")
          item['interest_rate'] = get_table_attr(selector, "interest rate")
          item['interest_rate_supplement'] = get_table_attr(selector, "interest rate supplement")
          yield item

    def parse_us_lending(self, response):
        for selector in response.xpath("//table"):
          item = UsLending()
          item['auction_type'] = 'usLending'
          item['auction_date'] = get_table_attr(selector, "auction date")
          item['settlement_date'] = get_table_attr(selector, "settlement date")
          item['maturity_date'] = get_table_attr(selector, "maturity date")
          item['term'] = get_table_attr(selector, "term")
          item['offered_volume'] = get_table_attr(selector, "offered volume")
          item['marginal_interest_rate'] = get_table_attr(selector, "marginal interest rate")
          item['allotment_at_marginal'] = get_table_attr(selector, "allotment at marginal")
          item['total_bid_amount'] = get_table_attr(selector, "total bid amount")
          item['number_of_bids'] = get_table_attr(selector, "number of bids")
          item['allotment'] = get_table_attr(selector, "allotment") #if item['allotment] == none. hmm?
          yield item

    def parse_municipal(self, response):
        for selector in response.xpath("//table"):
          item = MunicipalBonds()
          item['auction_type'] = 'municipalBonds'
          item['auction_date'] = get_table_attr(selector, "auction date")
          item['loan_number'] = get_table_attr(selector, "loan")
          item['isin'] = get_table_attr(selector, "isin")
          item['coupon'] = get_table_attr(selector, "coupon")
          item['tendered_volume'] = get_table_attr(selector, "tendered")
          item['volume_offered'] = get_table_attr(selector, "offered")
          item['volume_bought'] = get_table_attr(selector, "bought")
          item['number_of_bids'] = get_table_attr(selector, "number of bids")
          item['number_of_accepted_bids'] = get_table_attr(selector, "number of accepted bids")
          item['average_yield'] = get_table_attr(selector, "average yield")
          item['lowest_accepted_yield'] = get_table_attr(selector, "lowest accepted yield")
          item['highest_accepted_yield'] = get_table_attr(selector, "highest yield")
          item['accepted_at_lowest_perc'] = get_table_attr(selector, "accepted at lowest")
          yield item


#items
# -*- coding: utf-8 -*-
from scrapy import Item, Field

class GovernmentBonds(Item):
  auction_type = Field()
  auction_date = Field()
  loan_number = Field()
  isin = Field()
  coupon = Field()
  tendered_volume = Field()
  volume_offered = Field()
  volume_bought = Field()
  number_of_bids = Field()
  number_of_accepted_bids = Field()
  average_yield = Field()
  lowest_accepted_yield = Field()
  highest_accepted_yield = Field()
  accepted_at_lowest_perc = Field()
  #auction_date = Field()
  auction_type = Field()

class CoveredBonds(Item):
  auction_type = Field()
  auction_date = Field()
  loan_number = Field()
  isin = Field()
  coupon = Field()
  tendered_volume = Field()
  volume_offered = Field()
  volume_bought = Field()
  number_of_bids = Field()
  number_of_accepted_bids = Field()
  average_yield = Field()
  lowest_accepted_yield = Field()
  highest_accepted_yield = Field()
  accepted_at_lowest_perc = Field()
  #auction_date = Field()
  auction_type = Field()

class MunicipalBonds(Item):
  auction_type = Field()
  auction_date = Field()
  loan_number = Field()
  isin = Field()
  coupon = Field()
  tendered_volume = Field()
  volume_offered = Field()
  volume_bought = Field()
  number_of_bids = Field()
  number_of_accepted_bids = Field()
  average_yield = Field()
  highest_accepted_yield = Field()
  lowest_accepted_yield = Field()
  accepted_at_lowest_perc = Field()
  #auction_date = Field()
  auction_type = Field()

class SekLending(Item):
  auction_type = Field()
  auction_date = Field()
  payment_date = Field()
  maturity_date = Field()
  term = Field()
  offered_volume = Field()
  total_bid_amount = Field()
  number_of_bids = Field()
  allotment = Field()
  interest_rate = Field()

class CommercialPapers(Item):
  auction_type = Field()
  auction_date = Field()
  credit_rating_class = Field()
  term = Field()
  fixed_purchase_rate = Field()
  total_bid_amount = Field()
  volume_bought = Field()
  percentage_alloted = Field()
  number_of_bids = Field()

class FundingForLending(Item):
  auction_type = Field()
  auction_date = Field()
  settlement_date = Field()
  final_repayment_date = Field()
  offered_volume = Field()
  total_bid_amount = Field()
  number_of_bids = Field()
  allotment = Field()
  interest_rate = Field()
  interest_rate_supplement = Field()

class UsLending(Item):
  auction_type = Field()
  auction_date = Field()
  settlement_date = Field()
  maturity_date = Field()
  term = Field()
  offered_volume = Field()
  marginal_interest_rate = Field()
  allotment_at_marginal = Field()
  total_bid_amount = Field()
  number_of_bids = Field()
  allotment = Field()


#settings

BOT_NAME = 'RiksbankAuctionScraper'

SPIDER_MODULES = ['RiksbankAuctionScraper.spiders']
NEWSPIDER_MODULE = 'RiksbankAuctionScraper.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ReportScraper (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

HTTPCACHE_ENABLED = True

DOWNLOAD_DELAY = 10.0


#crawl
#scrapy crawl RiksbankAuctionScraper -o output.json

###import it to R
library("rjson")
output <- fromJSON(file = "C:\\Users\\chris\\Documents\\Python\\Projects\\RiksbankAuctionScraper\\output.json")
output <- rbindlist(output, idcol = TRUE)

library(data.table)
df <- rbindlist(output, idcol = TRUE)




# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}


AUTOTHROTTLE_ENABLED = True

FEED_EXPORT_ENCODING = 'utf-8'

6.2.6 BeautifulSoup


mport urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl


# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = 'https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm'
html = urllib.request.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')

#text = soup.get_text() ##get all text
#print(text) ##print all text

#tables = soup.find_all("table") #find all tables
#print(tables)

#print(tables[1]["style"]) ##get the style element

#lists = soup.find_all("li") #find all list elements
#print(len(lists)) ##print number of list


links = soup.select("a[href*=minutes]")
#print(links)
for link in links:
    print(link.get('href'))

6.3 Misc

6.3.1 Python in R Markdown


#```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(reticulate)
py_install("pandas") ##install packages

#https://rstudio.github.io/reticulate/articles/python_packages.html

#https://stackoverflow.com/questions/54261906/reticulate-importerror-no-module-named-pandas-in-rstudio-version-1-2