from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Function to extract table data from a container based on a CSS selector
def extract_table_data(container, selector):
    """Read a two-column table under *container* into a dict.

    The table is located with the CSS *selector* relative to *container*.
    Every ``<tr>`` that has exactly two ``<td>`` cells contributes one entry:
    the stripped text of the first cell is the key and the stripped text of
    the second cell is the value. Rows with any other number of ``<td>``
    cells are ignored; a repeated key keeps the value of the last such row.

    Args:
        container: Element (or driver) exposing ``find_element``.
        selector: CSS selector identifying the table.

    Returns:
        The key/value mapping, or an empty dict (after printing a notice)
        when ``NoSuchElementException`` is raised while reading the table.
    """
    try:
        table_rows = container.find_element(
            By.CSS_SELECTOR, selector
        ).find_elements(By.TAG_NAME, 'tr')
        # Lazy on purpose: each row's cells are fetched right before that
        # row's text is read, all inside the try block.
        cell_groups = (tr.find_elements(By.TAG_NAME, 'td') for tr in table_rows)
        return {
            cells[0].text.strip(): cells[1].text.strip()
            for cells in cell_groups
            if len(cells) == 2
        }
    except NoSuchElementException:
        print(f"No table found with selector: {selector}")
        return {}
# Function to collect data from all links in the table on the current page
def extract_data_from_page(driver, data):
    """Append one ``{'url', 'title'}`` record per usable table row.

    Waits up to 10 seconds for a ``<tbody>`` element to be present on the
    current page, then walks its ``<tr>`` rows. For each row the link target
    is the ``href`` of the first ``<a>`` inside the first ``<td>`` and the
    title is the stripped text of the second ``<td>``.

    Rows that cannot yield a record are skipped with a printed notice instead
    of aborting the whole page: rows with fewer than two ``<td>`` cells
    (previously an uncaught ``IndexError``), rows without an ``<a>`` in the
    first cell, and rows whose link has no ``href``.

    Args:
        driver: Selenium WebDriver whose current page holds the table.
        data: List the record dicts are appended to (mutated in place).

    Returns:
        True if a ``<tbody>`` was found (even when no row produced a record),
        False if the wait timed out.
    """
    try:
        tbody = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, 'tbody'))
        )
    except TimeoutException:
        print("No table found on the current page.")
        return False

    print('Found table on current page')
    for row in tbody.find_elements(By.TAG_NAME, 'tr'):
        cols = row.find_elements(By.TAG_NAME, 'td')
        if len(cols) < 2:
            # e.g. a header/spacer row: nothing to index into.
            print("Skipping row with fewer than two cells.")
            continue
        try:
            url = cols[0].find_element(By.TAG_NAME, 'a').get_attribute('href')
        except NoSuchElementException:
            print("No link found in this row.")
            continue
        if not url:
            # An <a> without an href cannot be navigated to later.
            print("No link found in this row.")
            continue
        data.append({'url': url, 'title': cols[1].text.strip()})
    return True
# Function to collect info from each link
def extract_data_from_url(driver, item):
    """Open ``item['url']`` and attach the detail tables to *item*.

    Navigates the driver to the URL, waits up to 10 seconds for the element
    with id ``opportunity-container``, and reads three tables from it with
    ``extract_table_data``. The resulting dicts are stored on *item* under
    'General Information', 'Eligibility Information' and
    'Additional Information', then echoed to stdout.

    If the container does not appear in time, a notice is printed and *item*
    is left without those three keys.

    Args:
        driver: Selenium WebDriver used for navigation.
        item: Dict with at least a 'url' key; mutated in place.
    """
    url = item['url']
    driver.get(url)
    print(f"Extracting data from URL: {url}")

    # (key stored on item, heading printed to stdout, CSS selector)
    sections = (
        ('General Information',
         "General Information:",
         'h2.margin-0 + div table.usa-table'),
        ('Eligibility Information',
         "\nEligibility Information:",
         'h2.margin-0:nth-of-type(2) + div table.usa-table'),
        ('Additional Information',
         "\nAdditional Information:",
         'h2.margin-0:nth-of-type(3) + div table.usa-table'),
    )

    try:
        container = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'opportunity-container'))
        )
        print('Found opportunity container')

        # Read all three tables before touching item or printing anything.
        tables = [
            extract_table_data(container, css)
            for _key, _heading, css in sections
        ]

        for (key, _heading, _css), table in zip(sections, tables):
            item[key] = table

        for (_key, heading, _css), table in zip(sections, tables):
            print(heading)
            for field, text in table.items():
                print(f"{field}: {text}")
    except TimeoutException:
        print(f"No opportunity container found on URL: {url}")
# Function to click the next button on the page
def click_next_button(driver):
    """Click the pagination "next" control if it is present and enabled.

    Waits up to 10 seconds for the control to become clickable.

    Args:
        driver: Selenium WebDriver positioned on a paginated listing page.

    Returns:
        True if the control was clicked. False if its ``aria-disabled``
        attribute is ``'true'`` (last page) or if it did not become clickable
        in time; in the timeout case the page source is printed for debugging.
    """
    # By.CLASS_NAME accepts a single class name only; passing several
    # space-separated classes never matches. A CSS selector that chains the
    # classes with dots matches an element carrying all of them.
    next_button_locator = (
        By.CSS_SELECTOR,
        '.usa-button.usa-button--unstyled'
        '.usa-pagination__link.usa-pagination__next-page',
    )
    try:
        next_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(next_button_locator)
        )
    except TimeoutException:
        print("Next button not found, end of pages.")
        print(driver.page_source)  # Print page source for debugging
        return False

    if next_button.get_attribute('aria-disabled') == 'true':
        print("Next button disabled, end of pages.")
        return False

    next_button.click()
    return True
# Main function to run the script
def main():
    """Scrape every listing page, then the detail page of each collected link.

    Runs in two phases so that navigation to detail pages cannot disturb
    pagination:

    1. Starting from ``url``, collect ``{'url', 'title'}`` records from each
       listing page and advance with the "next" control until there is no
       table or no further page.
    2. Visit each collected record's URL once and attach its detail tables.

    The browser is always closed, even if scraping raises.
    """
    driver = webdriver.Chrome()
    data = []
    page_count = 0
    try:
        # NOTE(review): `url` is not defined in the visible source; it must
        # exist as a module-level name holding the first listing page address.
        driver.get(url)

        # Phase 1: walk the listing pages while the driver stays on them.
        while True:
            page_count += 1
            print(f"Now on page {page_count}")
            if not extract_data_from_page(driver, data):
                break
            if not click_next_button(driver):
                break

        # Phase 2: visit detail pages only after pagination is finished, so
        # each item is fetched exactly once and the "next" control is never
        # searched for on a detail page.
        for item in data:
            extract_data_from_url(driver, item)
    finally:
        driver.quit()
    print(f"Total pages navigated: {page_count}")
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()