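"""scraper.py

Selenium scraping demos: a commented-out bare-bones example, a minimal
headless-Chrome example against https://scrapingclub.com/, and an interactive
scraper that locates elements by class name, CSS selector, or XPath and
exports the matched text to CSV or JSON.
"""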
# from selenium import webdriver
# from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.common.by import By
# # initialize an instance of the chrome driver (browser)
# driver = webdriver.Chrome()
# # enable headless mode in Selenium
# options = Options()
# # enable headless mode
# options.headless = True
# options.add_argument('--headless=new')
# driver = webdriver.Chrome(
#     options=options,
#     # other properties...
# )
# # visit your target site
# driver.get('https://scrapingclub.com/')
# # scraping logic...
# exercise1_card = driver.find_element(By.CLASS_NAME, 'w-full.rounded.border') #w-full rounded border
# # or
# # exercise1_card = driver.find_element(By.CSS_SELECTOR, '.w-full.rounded.border')
# # or
# # exercise1_card = driver.find_element(By.XPATH, '/html/body/div[3]/div[2]/div/div[1]')
# print(exercise1_card)
# # release the resources allocated by Selenium and shut down the browser
# driver.quit()
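
# Standalone example: open https://scrapingclub.com/ in headless Chrome, wait
# for the first exercise card to load, and print its text.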
import time
import csv
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# initialize an instance of the chrome driver (browser) in headless mode
# ('--headless=new' supersedes the deprecated Options.headless flag)
options = Options()
options.add_argument('--headless=new')
driver = webdriver.Chrome(options=options)

try:
    # visit your target site
    driver.get('https://scrapingclub.com/')
    # wait for the element to be present
    wait = WebDriverWait(driver, 10)
    exercise1_card = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'w-full.rounded.border'))
    )
    print(exercise1_card.text)
except Exception as e:
    print(f"An error occurred: {e}")
finally:
    # release the resources allocated by Selenium and shut down the browser
    driver.quit()
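
# Reusable version: helper functions to start the driver, locate elements by
# class name / CSS selector / XPath, and export the scraped text to CSV or JSON.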
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# from bs4 import BeautifulSoup  # not used below; uncomment if you add HTML parsing

# Function to initialize the Selenium WebDriver
def init_driver(headless=True):
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=chrome_options)
    return driver

# Function to find all elements using different strategies
def find_elements(driver, strategy, value):
    wait = WebDriverWait(driver, 10)
    try:
        if strategy == "class_name":
            locator = (By.CLASS_NAME, value)
        elif strategy == "css_selector":
            locator = (By.CSS_SELECTOR, value)
        elif strategy == "xpath":
            locator = (By.XPATH, value)
        else:
            raise ValueError(
                f"Invalid strategy '{strategy}'. Use 'class_name', 'css_selector', or 'xpath'."
            )
        return wait.until(EC.presence_of_all_elements_located(locator))
    except Exception as e:
        error_message = f"Error finding elements using {strategy} with value '{value}': {e}"
        print(error_message)
        return []

# Function to scrape a webpage and export data
def scrape_website():
    driver = init_driver(headless=True)
    websiteInp = input("Enter URL for desired website to scrape: ")  # example: https://www.flipkart.com/search?q=laptop&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off
    elementInp = input("Enter the class_name of the desired element to scrape: ")  # example: KzDlHZ
    export_format = input("Choose export format (CSV or JSON): ").strip().lower()
    try:
        # Open the target website
        driver.get(websiteInp)
        time.sleep(5)  # Adding a 5-second wait after loading the page
        # Find elements
        elements = find_elements(driver, "class_name", elementInp)
        # testWebsite = "https://scrapingclub.com/"
        # testElement = "w-full.rounded.border"
        # elements = find_elements(driver, "class_name", testElement)  # use testElement instead of elementInp for a quick test
        # you can also use:
        # elements = find_elements(driver, 'css_selector', '.w-full.rounded.border')
        # elements = find_elements(driver, 'xpath', '/html/body/div[3]/div[2]/div/div[1]')
        # Extract and format data
        data = [{"content": el.text} for el in elements if el.text]
        if not data:
            print("No data found to export.")
            return
        # Export data
        if export_format == 'csv':
            with open("scraped_data.csv", "w", newline="", encoding="utf-8") as f:
                writer = csv.DictWriter(f, fieldnames=["content"])
                writer.writeheader()
                writer.writerows(data)
            print("Data exported to 'scraped_data.csv'.")
        elif export_format == 'json':
            with open("scraped_data.json", "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=4)
            print("Data exported to 'scraped_data.json'.")
        else:
            print("Invalid format selected. Please choose CSV or JSON.")
    except ValueError as e:
        print(e)
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        driver.quit()


if __name__ == "__main__":
    scrape_website()
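
# Example session (a sketch: assumes Chrome is installed; Selenium 4.6+ fetches
# a matching chromedriver automatically; the URL and class name are illustrative):
#
#   $ python scraper.py
#   Enter URL for desired website to scrape: https://scrapingclub.com/
#   Enter the class_name of the desired element to scrape: w-full.rounded.border
#   Choose export format (CSV or JSON): json
#   Data exported to 'scraped_data.json'.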