JRA内部urlからBeautifulSoupを得る関数
JRA内部urlには次の2種類がある
href="#" onclick="return doAction('/JRADB/accessS.html', 'pw01srl10092023020420230402/F2');"
href="/JRADB/accessS.html?CNAME=pw01sde1006202303030120230401/22"
これらのurlからBeautifulSoupを得る方法
onclickのurlだと
import requests
from bs4 import BeautifulSoup
def url2soup(url, *, timeout=30):
    """Fetch a JRA page addressed by an onclick-style link and parse it.

    Args:
        url: A ``(path, cname)`` pair, i.e. the two arguments that the
            link's ``doAction(...)`` call carries.
        timeout: Seconds to wait for the server before requests gives up.

    Returns:
        The response body, decoded as Shift_JIS, parsed by BeautifulSoup
        with the lxml parser.
    """
    path, cname = url[0], url[1]
    # The path normally starts with "/", so strip it before joining to
    # avoid sending "https://www.jra.go.jp//JRADB/...".
    endpoint = "https://www.jra.go.jp/" + path.lstrip('/')
    r = requests.post(endpoint, data='cname=' + cname, timeout=timeout)
    # Set the encoding explicitly so r.text is decoded as Shift_JIS.
    r.encoding = 'Shift_jis'
    return BeautifulSoup(r.text, 'lxml')
# Example for the onclick form: the two doAction() arguments go in as a pair.
url_onclick = ('/JRADB/accessS.html', 'pw01srl10092023020420230402/F2')
soup = url2soup(url_onclick)
print(soup.find('title'))
cnameのurlだと
import requests
from bs4 import BeautifulSoup
# The href form is a site-relative link. requests refuses a URL without a
# scheme (MissingSchema), so the site origin has to be put in front of it.
url_cname = '/JRADB/accessS.html?CNAME=pw01sde1006202303030120230401/22'
r = requests.get('https://www.jra.go.jp' + url_cname, timeout=30)
soup = BeautifulSoup(r.content, 'html.parser')
print(soup.find('title'))
使い分け面倒なのでこれにする
import requests
from bs4 import BeautifulSoup
def url2soup(url, *, timeout=30):
    """Fetch a JRA-internal page and return it parsed as BeautifulSoup.

    Both link styles are accepted and both are fetched with the same POST.

    Args:
        url: Either a ``(path, cname)`` pair (the two arguments of an
            onclick ``doAction(...)`` call), or a single href string of
            the form ``'<path>?CNAME=<cname>'``.
        timeout: Seconds to wait for the server before requests gives up.

    Returns:
        The response body, decoded as Shift_JIS, parsed by BeautifulSoup
        with the lxml parser.

    Raises:
        ValueError: If ``url`` is a string that has no ``?CNAME=`` part.
    """
    if isinstance(url, str):
        path, marker, cname = url.partition('?CNAME=')
        if not marker:
            # Indexing a plain string would silently yield its first two
            # characters as path and cname, so reject it instead.
            raise ValueError(
                "expected '<path>?CNAME=<cname>' or a (path, cname) pair, "
                "got %r" % (url,)
            )
    else:
        path, cname = url[0], url[1]
    # The path normally starts with "/", so strip it before joining to
    # avoid sending "https://www.jra.go.jp//JRADB/...".
    endpoint = "https://www.jra.go.jp/" + path.lstrip('/')
    r = requests.post(endpoint, data='cname=' + cname, timeout=timeout)
    # Set the encoding explicitly so r.text is decoded as Shift_JIS.
    r.encoding = 'Shift_jis'
    return BeautifulSoup(r.text, 'lxml')
# Sample links, one per form: the (path, cname) pair taken from an onclick
# doAction() call, and an href that carries the cname in its ?CNAME= query.
url_onclick = ('/JRADB/accessS.html', 'pw01srl10092023020420230402/F2')
url_cname = '/JRADB/accessS.html?CNAME=pw01sde1006202303030120230401/22'

# Fetch each one through url2soup and show the <title> of the result.
for _link in (url_onclick, url_cname):
    soup = url2soup(_link)
    print(soup.find('title'))