# Web-Scraping-with-Selenium-and-Python/quora.py at master · pythonprogramming-development/Web-Scraping-with-Selenium-and-Python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

space_url = "https://neweraofcoding.quora.com/"

# Seconds to wait after each scroll before re-measuring the page height.
SCROLL_PAUSE = 2


def build_driver():
    """Return a Chrome WebDriver configured with --headless and --disable-gpu."""
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    return webdriver.Chrome(options=options)


def scroll_to_bottom(driver, pause=SCROLL_PAUSE, max_scrolls=None):
    """Scroll the page down until its height stops growing.

    Args:
        driver: WebDriver whose current page should be scrolled.
        pause: Seconds to sleep after each scroll before re-measuring.
        max_scrolls: Optional upper bound on the number of height-increasing
            scrolls. ``None`` (the default) keeps scrolling until the
            document height no longer changes.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls = 0
    while max_scrolls is None or scrolls < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Nothing new was appended; treat this as the end of the feed.
            break
        last_height = new_height
        scrolls += 1


def fetch_page_source(url):
    """Load *url* in headless Chrome, scroll to the bottom and return the HTML.

    The browser is always shut down, even when loading or scrolling raises,
    so a failed run does not leave a Chrome process behind.
    """
    driver = build_driver()
    try:
        driver.get(url)
        # Fixed pause after navigation, before scrolling starts
        # (presumably to let the initial content render).
        time.sleep(3)
        scroll_to_bottom(driver)
        return driver.page_source
    finally:
        driver.quit()


def collect_posts(links):
    """Return unique ``(title, url)`` pairs for the post links in *links*.

    Args:
        links: Iterable of ``(href, text)`` pairs, one per anchor.

    Returns:
        A list of ``(title, full_url)`` tuples in first-seen order. Only hrefs
        that start with ``/post/`` or contain ``quora.com/post/`` are kept;
        hrefs starting with ``/`` are prefixed with ``https://www.quora.com``.
        Anchors whose stripped text is empty are skipped and do not reserve
        their URL, so a later anchor with the same URL and real text is kept.
    """
    posts = []
    seen_urls = set()  # O(1) duplicate check instead of rescanning `posts`
    for href, text in links:
        if not (href.startswith("/post/") or "quora.com/post/" in href):
            continue
        full_url = "https://www.quora.com" + href if href.startswith("/") else href
        title = text.strip()
        if title and full_url not in seen_urls:
            seen_urls.add(full_url)
            posts.append((title, full_url))
    return posts


def render_markdown(posts):
    """Format ``(title, link)`` pairs as a Markdown bullet list of links."""
    return "\n".join(f"- [{title}]({link})" for title, link in posts)


def main():
    """Scrape the space, print the posts as Markdown and save them to a file."""
    soup = BeautifulSoup(fetch_page_source(space_url), "html.parser")
    links = ((a_tag["href"], a_tag.get_text()) for a_tag in soup.find_all("a", href=True))
    posts = collect_posts(links)

    # Output as Markdown
    markdown = render_markdown(posts)
    print(markdown)
    print(f"\nTotal posts found: {len(posts)}")

    # Also save the same Markdown to a file in the working directory.
    with open("quora_space_posts.md", "w", encoding="utf-8") as f:
        f.write(markdown)


if __name__ == "__main__":
    main()