import requests
from bs4 import BeautifulSoup
import datetime
# Collect the link text of every quant-ph result (cross-lists included)
# from arXiv's advanced search, newest first, and append one entry per line
# to ./pre-pre-pre/arxiv_ids.txt.

# Set the URL parameters
today = datetime.date.today()
# Number of results requested per page; also used to detect the last page
batch_size = 200
params = {
    'advanced': '',
    'terms-0-operator': 'AND',
    'terms-0-term': '',
    'terms-0-field': 'title',
    'classification-physics': 'y',
    'classification-physics_archives': 'quant-ph',
    'classification-include_cross_list': 'include',
    'date-year': '',
    'date-filter_by': 'date_range',
    'date-from_date': '1992-10-7',
    # Explicit ISO string (YYYY-MM-DD) instead of relying on implicit str()
    'date-to_date': today.isoformat(),
    'date-date_type': 'announced_date_first',
    'abstracts': 'hide',
    # Derived from batch_size so the page size and the end-of-results
    # check below can never disagree
    'size': str(batch_size),
    'order': '-announced_date_first',  # Order by date, newest first
}
# Offset of the first result on the page about to be fetched
start_index = 0

# Open the output file in append mode so entries from earlier runs are kept
with open('./pre-pre-pre/arxiv_ids.txt', 'a', encoding='utf-8') as f:
    # Loop through all batches and extract the arXiv IDs
    # NOTE(review): requests are sent back-to-back with no delay; confirm
    # this is acceptable under arXiv's rate-limit guidance.
    while True:
        # Update the starting index in the URL parameters
        params['start'] = start_index
        # Make a GET request to the URL; the timeout keeps a stalled
        # connection from hanging the script indefinitely
        response = requests.get(
            'https://arxiv.org/search/advanced', params=params, timeout=30)
        print(response.status_code)
        # Fail loudly on an HTTP error: an error page contains no result
        # elements, so without this check the loop would stop early and
        # the truncated output would look like a completed run
        response.raise_for_status()
        html_content = response.content
        # Parse the HTML content using Beautiful Soup
        soup = BeautifulSoup(html_content, 'html.parser')
        # Find all <p> elements whose class is 'list-title is-inline-block'
        arxiv_elements = soup.find_all(
            'p', {'class': 'list-title is-inline-block'})
        # Write the stripped text of each element's first link
        for element in arxiv_elements:
            link = element.find('a')
            if link is None:
                # No link inside this element; nothing to record
                continue
            arxiv_id = link.text.strip()
            f.write(arxiv_id + '\n')
        # Fewer elements than a full page means this was the last page
        if len(arxiv_elements) < batch_size:
            break
        # Increase the starting index for the next batch
        start_index += batch_size
# E. J. Keskinoglu