6 Python

6.1 Basic

6.1.1 Regular Expressions

Regular expressions
^ Matches the beginning of a line
$ Matches the end of a line
. Matches any character
backslash s Matches whitespace
backslash S Matches any non-whitespace character
* Repeats a character zero or more times
*? Repeats a character zero or more times (non-greedy)
+ Repeats a character one or more times
+? Repeats a character one or more times (non-greedy)
[aeiou] Matches a single character in the listed set
[^YXZ] Matches a single character not in the listed set
[a-z0-9] The set of characters can include a range
( Indicates where string extraction is to start
) Indicates where string extraction is to end
/th> Matches word boundary
/th> any digit
any non-digit
any whitespace
any non-whitespace
any alphanumerical character
any non-alphanumerical character
{n} Exactly n repetitions
{n,} At least n repetitions
{,n} At most n repetitions
{m,n} At least m and at most n repetitions

6.1.3 Working with Text and Strings

#Working with Text

text1 = "Since February 2015, the Riksbank has purchased government bonds with the aim of making monetary policy more expansionary."

len(text1) #get number of characters

text2 = text1.split(' ') #split into words
len(text2) #get number of words
print(text2)

#finding specific words_fomc
print([w for w in text2 if len(w) > 3]) #get all words longer then 3 characters

#capitalized words
print([w for w in text2 if w.istitle()]) #use istitle to get words with capitalized first character

#words that end with s
print([w for w in text2 if w.endswith('s')]) #words that end with s


#Finding unique words
text3 = 'To be or not to be'
text4 = text3.split(' ')
print(len(text4))

print(set(text4)) ##finding unique words with set (but caps sensitive)

print(len(set([w.lower() for w in text4]))) ##only get lower
print(set([w.lower() for w in text4]))
#minut 4.

##check what it is
#s.startwidth(t)
#s.endswith(t)
#t in s #if a substring is in a bigger
#s.isupper(); s.islower(); s.istitle()
#s.isalpha(); s.isdigit(); s.isalnum() #if string is alphabetical, numerical, or both

##string operations (change the string)
#s.lower(); s.upper(); s.titlecase()
#s.split(t) #split depending on t
#s.splitlines() #split sentence line on new line character or end of line character
#s.join(t) #takes word t and join with s
#s.strip(); s.rstrip() ##strip, take out for example space. strip from front, rstrip from endswith
#s.find(t); s.rfind(t)
#s.replace(u,v)

###from words to characters
text5 = 'ouagadougou'
text6 = text5.split('ou') #split on ou
print(text6)
text6 = 'ou'.join(text6) #now you get back the full. split and join opposite.
print(text6)

#get all of the characters, do it as a list
print(list(text5))
print([c for c in text5])

###Cleaning text
text8 = '    A quick brown fox jumped over the lazy dog. '

text9 = text8.strip() #strips out white characters
print(text9.split(' ')) #split on space

###Changing text1
#find and replace
text9 = '    A quick brown fox jumped over the lazy dog. '
print(text9.find('o')) #starts from start

print(text9.rfind('o')) #starts from end

print(text9.replace('o', 'O')) #replace small letters with capital

6.2 Scrapy

pip install Scrapy

6.2.1 Shell commands

Run scrapy in shell: Write ‘scrapy shell’ in miniconda

Fetch page: Write ‘Fetch (“https://www.riksbank.se/sv/penningpolitik/penningpolitiska-instrument/kop-av-foretagscertifikat/auction-results/2020/results-of-auctions-2020-09-23/”)’

Use view(response)

6.2.3 Scrape table json treasury scraper

# Spider
# -*- coding: utf-8 -*-
#from scrapy import BaseSpider
import scrapy
#from TreasuryScraper.items import TreasuryItem
import json
class TreasurySpider(scrapy.Spider):
    name = 'treasury'
    start_urls = [
        'https://www.treasurydirect.gov/TA_WS/securities/jqsearch?format=json&filterscount=0&groupscount=0&pagenum=0&pagesize=1000&recordstartindex=0&recordendindex=1000',
    ]
    def parse(self, response):
        jsonresponse = json.loads(response.text)
        for item in jsonresponse['securityList']:
            yield {
                'cusip': item['cusip'],
                'securityType': item['securityType'],
                'securityTerm': item['securityTerm'],
                'offeringAmount': item['offeringAmount'],
                'tips': item['tips'],
                'type': item['type'],
                'pricePer100': item['pricePer100'],
                'floatingRate': item['floatingRate'],
                'reopening': item['reopening'],
                'auctionDate': item['auctionDate'],
                'maturityDate': item['maturityDate'],
                'term': item['term'],
                'competitiveAccepted': item['competitiveAccepted'],
                'allocationPercentage': item['allocationPercentage'],
                'averageMedianYield': item['averageMedianYield'],
                'bidToCoverRatio': item['bidToCoverRatio'],
                'competitiveAccepted': item['competitiveAccepted'],
                'highYield': item['highYield'],
                'lowYield': item['lowYield'],
                'somaAccepted': item['somaAccepted'],
                'somaHoldings': item['somaHoldings'],
                'primaryDealerAccepted': item['primaryDealerAccepted'],
                'directBidderAccepted': item['directBidderAccepted'],
                'directBidderTendered': item['directBidderTendered'],
                'indirectBidderAccepted': item['indirectBidderAccepted'],
                'indirectBidderTendered': item['indirectBidderTendered'],
                'interestPaymentFrequency': item['interestPaymentFrequency']

                }

#settings
# -*- coding: utf-8 -*-
#BOT_NAME = 'TreasuryScraper'
#SPIDER_MODULES = ['TreasuryScraper.spiders']
#NEWSPIDER_MODULE = 'TreasuryScraper.spiders'
#ROBOTSTXT_OBEY = False
#DOWNLOAD_DELAY = 60.0
#AUTOTHROTTLE_ENABLED = True
#HTTPCACHE_ENABLED = True 
#FEED_EXPORT_ENCODING = 'utf-8'

#scrapy crawl treasury -o output.csv

6.2.4 Scrape table json auction results Riksbank


import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from RiksbankAuctionScraper.items import GovernmentBond

def get_table_attr(response, x): # to handle strange tables
  # using normalize-space() to avoid "\n" as matching xpath
  xpaths = [
    './/td[contains(text(),"{0}")]/following-sibling::td/text()[normalize-space()]', #if td contains, use following sibling
    './/td[contains(text(),"{0}")]/following-sibling::td/p/text()[normalize-space()]',
    './/th/span[contains(text(),"{0}")]/../following-sibling::td/span/text()[normalize-space()]' #if th/span contains text, use following sibbling
  ]
  xpath_str = '|'.join(xpaths).format(x)
  return response.xpath(xpath_str).get()

class RiksbankSpider(CrawlSpider):
    name = "RiksbankAuctionScraper_v5"
    allowed_domains = ['riksbank.se']
    start_urls = [
      'https://www.riksbank.se/sv/penningpolitik/penningpolitiska-instrument/kop-av-statsobligationer/results-of-auctions'
    ]
    rules = (
      Rule(LinkExtractor(
        allow=('\/kop-av-statsobligationer\/results-of-auctions\/2020\/results-of-auctions-\d{4}-\d{2}-\d{2}\/$')
      ), callback='parse_government_bond'),
    )
    
    def parse_government_bond(self, response):
        for selector in response.xpath("//table"):
          item = GovernmentBond()
          item['auction_type'] = 'statsobligationer' 
          item['auction_date'] = get_table_attr(selector, "Auction date") #text to look for
          item['loan_number'] = get_table_attr(selector, "Loan")
          yield item
          
 #Items    
# -*- coding: utf-8 -*-
from scrapy import Item, Field

class GovernmentBonds(Item):
  auction_type = Field()
  auction_date = Field()
  loan_number= Field()         

6.2.5 Scrape table RiksbankAuctionScraper

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from RiksbankAuctionScraper.items import GovernmentBonds, SekLending, FundingForLending, CommercialPapers, UsLending, CoveredBonds, MunicipalBonds

def get_table_attr(response, x):
  # using normalize-space() to avoid "\n" as matching xpath
  xpaths = [
    './/td[contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"),"{0}")]/following-sibling::td/text()[normalize-space()]',
    './/td[contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"),"{0}")]/following-sibling::td/*/text()',
    './/td/span[contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"),"{0}")]/../following-sibling::td/span/text()',
    './/th/span[contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"),"{0}")]/../following-sibling::td/span/text()'
  ]
  xpath_str = '|'.join(xpaths).format(x)
  return response.xpath(xpath_str).get()

class RiksbankSpider(CrawlSpider):
    name = "RiksbankAuctionScraper"
    allowed_domains = ['riksbank.se']
    start_urls = [
     # 'https://www.riksbank.se/sv/penningpolitik/penningpolitiska-instrument/kop-av-kommunobligationer/auction-results/',
     # 'https://www.riksbank.se/sv/penningpolitik/penningpolitiska-instrument/kop-av-statsobligationer/results-of-auctions/',
      'https://www.riksbank.se/sv/penningpolitik/penningpolitiska-instrument/kop-av-sakerstallda-obligationer/results-of-auctions/',
      #'https://www.riksbank.se/sv/penningpolitik/penningpolitiska-instrument/kop-av-foretagscertifikat/auction-results/',
      #'https://www.riksbank.se/sv/penningpolitik/penningpolitiska-instrument/lan-till-bankerna-for-vidareutlaning-till-foretag/auction-results/',
     # 'https://www.riksbank.se/sv/penningpolitik/penningpolitiska-instrument/lan-i-amerikanska-dollar/auction-results/',
    #  'https://www.riksbank.se/sv/penningpolitik/penningpolitiska-instrument/veckovisa-extraordinara-marknadsoperationer/auction-results/'
    ]
    rules = (
      Rule(LinkExtractor(
        allow=('\/kop-av-statsobligationer\/results-of-auctions\/')
      ), callback='parse_bonds'),
      Rule(LinkExtractor(
        allow=('\/kop-av-sakerstallda-obligationer\/results-of-auctions\/2020\/')
      ), callback='parse_covered_bonds'),
      Rule(LinkExtractor(
        allow=('\/veckovisa-extraordinara-marknadsoperationer\/auction-results\/2020\/')
      ), callback='parse_sek_lending'),
      Rule(LinkExtractor(
        allow=('\/kop-av-foretagscertifikat\/auction-results\/2020\/')
      ), callback='parse_ftg_cert'),
      Rule(LinkExtractor(
        allow=('\/lan-till-bankerna-for-vidareutlaning-till-foretag\/auction-results\/2020\/')
      ), callback='parse_ffl'),
      Rule(LinkExtractor(
        allow=('\/lan-i-amerikanska-dollar\/auction-results\/2020\/')
      ), callback='parse_us_lending'),
      Rule(LinkExtractor(
        allow=('\/kop-av-kommunobligationer\/auction-results\/2020\/')
      ), callback='parse_municipal'),
    )

    def parse_bonds(self, response):
        for selector in response.xpath("//table"):
          item = GovernmentBonds()
          item['auction_type'] = 'governmentBonds'
          item['auction_date'] = get_table_attr(selector, "auction date")
          item['loan_number'] = get_table_attr(selector, "loan")
          item['isin'] = get_table_attr(selector, "isin")
          item['coupon'] = get_table_attr(selector, "coupon")
          item['tendered_volume'] = get_table_attr(selector, "tendered")
          item['volume_offered'] = get_table_attr(selector, "offered")
          item['volume_bought'] = get_table_attr(selector, "bought")
          item['number_of_bids'] = get_table_attr(selector, "number of bids")
          item['number_of_accepted_bids'] = get_table_attr(selector, "number of accepted bids")
          item['average_yield'] = get_table_attr(selector, "average yield")
          item['lowest_accepted_yield'] = get_table_attr(selector, "lowest accepted yield")
          item['highest_accepted_yield'] = get_table_attr(selector, "highest yield")
          item['accepted_at_lowest_perc'] = get_table_attr(selector, "accepted at lowest")
          yield item

    def parse_covered_bonds(self, response):
        for selector in response.xpath("//table"):
          item = CoveredBonds()
          item['auction_type'] = 'coveredBonds'
          item['auction_date'] = get_table_attr(selector, "auction date")
          item['loan_number'] = get_table_attr(selector, "loan")
          item['isin'] = get_table_attr(selector, "isin")
          item['coupon'] = get_table_attr(selector, "coupon")
          item['tendered_volume'] = get_table_attr(selector, "tendered")
          item['volume_offered'] = get_table_attr(selector, "offered")
          item['volume_bought'] = get_table_attr(selector, "bought")
          item['number_of_bids'] = get_table_attr(response, "number of bids")
          item['number_of_accepted_bids'] = get_table_attr(selector, "number of accepted bids")
          item['average_yield'] = get_table_attr(selector, "average yield")
          item['lowest_accepted_yield'] = get_table_attr(selector, "lowest accepted yield")
          item['highest_accepted_yield'] = get_table_attr(selector, "highest yield")
          item['accepted_at_lowest_perc'] = get_table_attr(selector, "accepted at lowest")
          yield item


    def parse_sek_lending(self, response):
        for selector in response.xpath("//table"):
          item = SekLending()
          item['auction_type'] = 'sekLending'
          item['auction_date'] = get_table_attr(selector, "auction date")
          item['payment_date'] = get_table_attr(selector, "payment date")
          item['maturity_date'] = get_table_attr(selector, "maturity date")
          item['term'] = get_table_attr(selector, "term")
          item['offered_volume'] = get_table_attr(selector, "offered volume")
          item['total_bid_amount'] = get_table_attr(selector, "total bid amount")
          item['number_of_bids'] = get_table_attr(selector, "number of bids")
          item['allotment'] = get_table_attr(selector, "allotment")
          item['interest_rate'] = get_table_attr(selector, "interest rate")
          yield item

    def parse_ftg_cert(self, response):
        for selector in response.xpath("//table"):
          item = CommercialPapers()
          item['auction_type'] = 'commercialPapers'
          item['auction_date'] = get_table_attr(selector, "auction date")
          item['credit_rating_class'] = get_table_attr(selector, "credit rating class")
          item['term'] = get_table_attr(selector, "term")
          item['fixed_purchase_rate'] = get_table_attr(selector, "fixed purchase rate")
          item['total_bid_amount'] = get_table_attr(selector, "total bid amount")
          item['volume_bought'] = get_table_attr(selector, "accepted volume")
          item['percentage_alloted'] = get_table_attr(selector, "percentage allotted")
          item['number_of_bids'] = get_table_attr(selector, "number of bids")
          yield item

    def parse_ffl(self, response):
        for selector in response.xpath("//table"):
          item = FundingForLending()
          item['auction_type'] = 'fundingForLending'
          item['auction_date'] = get_table_attr(selector, "auction date")
          item['settlement_date'] = get_table_attr(selector, "settlement date")
          item['final_repayment_date'] = get_table_attr(selector, "final repayment date")
          item['offered_volume'] = get_table_attr(selector, "offered volume")
          item['total_bid_amount'] = get_table_attr(selector, "total bid amount")
          item['number_of_bids'] = get_table_attr(selector, "number of bids")
          item['allotment'] = get_table_attr(selector, "allotment")
          item['interest_rate'] = get_table_attr(selector, "interest rate")
          item['interest_rate_supplement'] = get_table_attr(selector, "interest rate supplement")
          yield item

    def parse_us_lending(self, response):
        for selector in response.xpath("//table"):
          item = UsLending()
          item['auction_type'] = 'usLending'
          item['auction_date'] = get_table_attr(selector, "auction date")
          item['settlement_date'] = get_table_attr(selector, "settlement date")
          item['maturity_date'] = get_table_attr(selector, "maturity date")
          item['term'] = get_table_attr(selector, "term")
          item['offered_volume'] = get_table_attr(selector, "offered volume")
          item['marginal_interest_rate'] = get_table_attr(selector, "marginal interest rate")
          item['allotment_at_marginal'] = get_table_attr(selector, "allotment at marginal")
          item['total_bid_amount'] = get_table_attr(selector, "total bid amount")
          item['number_of_bids'] = get_table_attr(selector, "number of bids")
          item['allotment'] = get_table_attr(selector, "allotment") #if item['allotment] == none. hmm?
          yield item

    def parse_municipal(self, response):
        for selector in response.xpath("//table"):
          item = MunicipalBonds()
          item['auction_type'] = 'municipalBonds'
          item['auction_date'] = get_table_attr(selector, "auction date")
          item['loan_number'] = get_table_attr(selector, "loan")
          item['isin'] = get_table_attr(selector, "isin")
          item['coupon'] = get_table_attr(selector, "coupon")
          item['tendered_volume'] = get_table_attr(selector, "tendered")
          item['volume_offered'] = get_table_attr(selector, "offered")
          item['volume_bought'] = get_table_attr(selector, "bought")
          item['number_of_bids'] = get_table_attr(selector, "number of bids")
          item['number_of_accepted_bids'] = get_table_attr(selector, "number of accepted bids")
          item['average_yield'] = get_table_attr(selector, "average yield")
          item['lowest_accepted_yield'] = get_table_attr(selector, "lowest accepted yield")
          item['highest_accepted_yield'] = get_table_attr(selector, "highest yield")
          item['accepted_at_lowest_perc'] = get_table_attr(selector, "accepted at lowest")
          yield item


#items
# -*- coding: utf-8 -*-
from scrapy import Item, Field

class GovernmentBonds(Item):
  auction_type = Field()
  auction_date = Field()
  loan_number = Field()
  isin = Field()
  coupon = Field()
  tendered_volume = Field()
  volume_offered = Field()
  volume_bought = Field()
  number_of_bids = Field()
  number_of_accepted_bids = Field()
  average_yield = Field()
  lowest_accepted_yield = Field()
  highest_accepted_yield = Field()
  accepted_at_lowest_perc = Field()
  #auction_date = Field()
  auction_type = Field()

class CoveredBonds(Item):
  auction_type = Field()
  auction_date = Field()
  loan_number = Field()
  isin = Field()
  coupon = Field()
  tendered_volume = Field()
  volume_offered = Field()
  volume_bought = Field()
  number_of_bids = Field()
  number_of_accepted_bids = Field()
  average_yield = Field()
  lowest_accepted_yield = Field()
  highest_accepted_yield = Field()
  accepted_at_lowest_perc = Field()
  #auction_date = Field()
  auction_type = Field()

class MunicipalBonds(Item):
  auction_type = Field()
  auction_date = Field()
  loan_number = Field()
  isin = Field()
  coupon = Field()
  tendered_volume = Field()
  volume_offered = Field()
  volume_bought = Field()
  number_of_bids = Field()
  number_of_accepted_bids = Field()
  average_yield = Field()
  highest_accepted_yield = Field()
  lowest_accepted_yield = Field()
  accepted_at_lowest_perc = Field()
  #auction_date = Field()
  auction_type = Field()

class SekLending(Item):
  auction_type = Field()
  auction_date = Field()
  payment_date = Field()
  maturity_date = Field()
  term = Field()
  offered_volume = Field()
  total_bid_amount = Field()
  number_of_bids = Field()
  allotment = Field()
  interest_rate = Field()

class CommercialPapers(Item):
  auction_type = Field()
  auction_date = Field()
  credit_rating_class = Field()
  term = Field()
  fixed_purchase_rate = Field()
  total_bid_amount = Field()
  volume_bought = Field()
  percentage_alloted = Field()
  number_of_bids = Field()

class FundingForLending(Item):
  auction_type = Field()
  auction_date = Field()
  settlement_date = Field()
  final_repayment_date = Field()
  offered_volume = Field()
  total_bid_amount = Field()
  number_of_bids = Field()
  allotment = Field()
  interest_rate = Field()
  interest_rate_supplement = Field()

class UsLending(Item):
  auction_type = Field()
  auction_date = Field()
  settlement_date = Field()
  maturity_date = Field()
  term = Field()
  offered_volume = Field()
  marginal_interest_rate = Field()
  allotment_at_marginal = Field()
  total_bid_amount = Field()
  number_of_bids = Field()
  allotment = Field()


#settings

BOT_NAME = 'RiksbankAuctionScraper'

SPIDER_MODULES = ['RiksbankAuctionScraper.spiders']
NEWSPIDER_MODULE = 'RiksbankAuctionScraper.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ReportScraper (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

HTTPCACHE_ENABLED = True

DOWNLOAD_DELAY = 10.0


#crawl
#scrapy crawl RiksbankAuctionScraper -o output.json

###import it to R
library("rjson")
output <- fromJSON(file = "C:\\Users\\chris\\Documents\\Python\\Projects\\RiksbankAuctionScraper\\output.json")
output <- rbindlist(output, idcol = TRUE)

library(data.table)
df <- rbindlist(output, idcol = TRUE)




# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}


AUTOTHROTTLE_ENABLED = True

FEED_EXPORT_ENCODING = 'utf-8'