JRA内部urlからBeautifulSoupを得る関数

 

JRA内部urlに2種ある
href="#" onclick="return doAction('/JRADB/accessS.html', 'pw01srl10092023020420230402/F2')"
href="/JRADB/accessS.html?CNAME=pw01sde1006202303030120230401/22"
これらのurlからBeautifulSoupを得る方法

 


onclickのurlだと


import requests
from bs4 import BeautifulSoup

def url2soup(url):
    """Fetch a JRA page addressed by an onclick-style URL pair.

    Args:
        url: A ``(path, cname)`` pair, i.e. the two arguments of the
            ``doAction(...)`` call found in a link's ``onclick`` handler.

    Returns:
        BeautifulSoup (``lxml`` parser) built from the response text,
        decoded as Shift_JIS.
    """
    path, cname = url[0], url[1]
    # The path already starts with "/"; strip it so the request URL does not
    # end up with a double slash right after the host.
    endpoint = "https://www.jra.go.jp/" + path.lstrip("/")
    # The cname is sent as the raw POST body, exactly as "cname=<value>".
    r = requests.post(endpoint, data='cname=' + cname)
    r.encoding = 'Shift_jis'
    return BeautifulSoup(r.text, 'lxml')

# onclick form: the (path, cname) arguments of doAction(...), kept as a tuple.
url_onclick = ('/JRADB/accessS.html', 'pw01srl10092023020420230402/F2')
soup = url2soup(url_onclick)
# Show the page's <title> tag as a quick check of what was fetched.
title_tag = soup.find('title')
print(title_tag)

cnameのurlだと


import requests
from bs4 import BeautifulSoup

url_cname = '/JRADB/accessS.html?CNAME=pw01sde1006202303030120230401/22'
# The href is site-relative, but requests.get() needs an absolute URL
# (scheme + host) and raises MissingSchema otherwise, so prefix the JRA origin.
response = requests.get('https://www.jra.go.jp' + url_cname)
soup = BeautifulSoup(response.content, 'html.parser')
print(soup.find('title'))

2種類のurlを使い分けるのは面倒なので、どちらの形式でも受け付ける次の関数にまとめる


import requests
from bs4 import BeautifulSoup

def url2soup(url):
    """Fetch a JRA internal page and return it parsed as BeautifulSoup.

    Both kinds of JRA internal link are accepted and requested the same way,
    as a POST of ``cname=<value>`` to the page path.

    Args:
        url: Either a ``(path, cname)`` pair (the two arguments of the
            ``doAction(...)`` call in an ``onclick`` handler), or an href
            string of the form ``'<path>?CNAME=<cname>'``.

    Returns:
        BeautifulSoup (``lxml`` parser) built from the response text,
        decoded as Shift_JIS.

    Raises:
        ValueError: If ``url`` is a string that has no ``?CNAME=`` part.
    """
    if isinstance(url, str):
        # href form: everything after the first "?CNAME=" is the cname.
        path, marker, cname = url.partition('?CNAME=')
        if not marker:
            # Without the marker there is no cname to post; fail loudly
            # instead of sending a meaningless request.
            raise ValueError(f"JRA href has no '?CNAME=' part: {url!r}")
    else:
        # onclick form: (path, cname) pair.
        path, cname = url[0], url[1]

    # The path already starts with "/"; strip it so the request URL does not
    # end up with a double slash right after the host.
    endpoint = "https://www.jra.go.jp/" + path.lstrip("/")
    # The cname is sent as the raw POST body, exactly as "cname=<value>".
    r = requests.post(endpoint, data='cname=' + cname)
    r.encoding = 'Shift_jis'
    return BeautifulSoup(r.text, 'lxml')

# The two link forms found on JRA pages: an onclick (path, cname) pair and
# an href that carries the cname in its query string.
url_onclick = ('/JRADB/accessS.html', 'pw01srl10092023020420230402/F2')
url_cname = '/JRADB/accessS.html?CNAME=pw01sde1006202303030120230401/22'

# Fetch each one through the same function and print the page <title>.
for jra_url in (url_onclick, url_cname):
    soup = url2soup(jra_url)
    print(soup.find('title'))