|
@ -144,6 +144,20 @@ def login(driver): |
|
|
WebDriverWait(driver, 100).until(EC.visibility_of_element_located( |
|
|
WebDriverWait(driver, 100).until(EC.visibility_of_element_located( |
|
|
(By.XPATH, '//input[@name="search"]'))) |
|
|
(By.XPATH, '//input[@name="search"]'))) |
|
|
|
|
|
|
|
|
|
|
|
def relogin(driver, username='beachyoga278', password='sunfish278', timeout=100):
    """Fill in the login form again and wait for the listing page to appear.

    The operator solves the CAPTCHA by hand; this function blocks on a
    console prompt until ENTER is pressed, then waits for the search input
    to become visible.

    Args:
        driver: Selenium WebDriver whose current page shows the login form.
        username: Text typed into the ``username`` input. Defaults to the
            account name that was previously hard-coded here.
        password: Text typed into the ``password`` input. Defaults to the
            password that was previously hard-coded here.
        timeout: Maximum number of seconds to wait for the search input to
            become visible after the CAPTCHA prompt is confirmed.

    Raises:
        NoSuchElementException: Raised by ``find_element`` when the
            username or password input is not on the current page.
        TimeoutException: Raised by ``WebDriverWait.until`` when the search
            input does not become visible within ``timeout`` seconds.
    """
    # NOTE(review): the default credentials are kept only so existing
    # ``relogin(driver)`` callers keep working; prefer passing them in
    # from configuration instead of storing them in source control.

    # entering username and password into input boxes
    username_box = driver.find_element(by=By.XPATH, value='//input[@name="username"]')
    # send_keys appends to existing content, so drop any autofilled or
    # leftover text before typing.
    username_box.clear()
    username_box.send_keys(username)

    password_box = driver.find_element(by=By.XPATH, value='//input[@name="password"]')
    password_box.clear()
    password_box.send_keys(password)

    # The CAPTCHA has to be solved manually in the browser window.
    input("Press ENTER when CAPTCHA is completed\n")

    # wait for listing page show up (This Xpath may need to change based on different seed url)
    WebDriverWait(driver, timeout).until(EC.visibility_of_element_located(
        (By.XPATH, '//input[@name="search"]')))
|
|
|
|
|
|
|
|
# Saves the crawled HTML page, creating the directory path for HTML pages if it does not exist |
|
|
# Saves the crawled HTML page, creating the directory path for HTML pages if it does not exist |
|
|
def savePage(driver, page, url): |
|
|
def savePage(driver, page, url): |
|
@ -222,7 +236,10 @@ def crawlForum(driver): |
|
|
|
|
|
|
|
|
while has_next_page: |
|
|
while has_next_page: |
|
|
try: |
|
|
try: |
|
|
driver.get(link) |
|
|
|
|
|
|
|
|
if driver.findElements(by=By.XPATH, value='//input[@name="username"]').size() > 0: |
|
|
|
|
|
relogin(driver) |
|
|
|
|
|
else: |
|
|
|
|
|
driver.get(link) |
|
|
except: |
|
|
except: |
|
|
driver.refresh() |
|
|
driver.refresh() |
|
|
html = driver.page_source |
|
|
html = driver.page_source |
|
@ -232,7 +249,10 @@ def crawlForum(driver): |
|
|
for item in list: |
|
|
for item in list: |
|
|
itemURL = urlparse.urljoin(baseURL, str(item)) |
|
|
itemURL = urlparse.urljoin(baseURL, str(item)) |
|
|
try: |
|
|
try: |
|
|
driver.get(itemURL) |
|
|
|
|
|
|
|
|
if driver.findElements(by=By.XPATH, value='//input[@name="username"]').size() > 0: |
|
|
|
|
|
relogin(driver) |
|
|
|
|
|
else: |
|
|
|
|
|
driver.get(itemURL) |
|
|
except: |
|
|
except: |
|
|
driver.refresh() |
|
|
driver.refresh() |
|
|
savePage(driver, driver.page_source, item) |
|
|
savePage(driver, driver.page_source, item) |
|
|