|
@ -45,7 +45,7 @@ def startCrawling(): |
|
|
print(driver.current_url, e) |
|
|
print(driver.current_url, e) |
|
|
closeDriver(driver) |
|
|
closeDriver(driver) |
|
|
|
|
|
|
|
|
new_parse(mktName, BASE_URL, False) |
|
|
|
|
|
|
|
|
# new_parse(mktName, BASE_URL, False) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Returns the name of the website |
|
|
# Returns the name of the website |
|
@ -129,8 +129,35 @@ def getAccess(): |
|
|
# then allows for manual solving of captcha in the terminal |
|
|
# then allows for manual solving of captcha in the terminal |
|
|
#@param: current selenium web driver |
|
|
#@param: current selenium web driver |
|
|
def login(driver, username='LordTachonky', password='BorderRanked', pin='541236', timeout=100):
    """Walk the selenium driver through the site's login and PIN forms.

    The two CAPTCHAs are solved by hand: the function blocks on ``input()``
    until the operator presses ENTER in the terminal, then continues.

    @param driver: current selenium web driver
    @param username: text typed into the element with id "username"
    @param password: text typed into the element with id "password"
    @param pin: text typed into the element with id "pin"
    @param timeout: seconds each WebDriverWait polls before giving up

    Exceptions raised by selenium (failed waits or element lookups) are not
    caught here and propagate to the caller.
    """
    input("Press ENTER when CAPTCHA is completed and LOGIN page is loaded\n")

    # wait for page to show up (This Xpath may need to change based on different seed url)
    WebDriverWait(driver, timeout).until(EC.visibility_of_element_located(
        (By.XPATH, '//*[@id="username"]')))

    # entering username and password into input boxes
    usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
    usernameBox.send_keys(username)
    passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
    passwordBox.send_keys(password)

    # the login form carries its own CAPTCHA; let the operator finish it before submitting
    input("Press ENTER when CAPTCHA is finished\n")

    # named loginButton (not `login`) so the local does not shadow this function
    loginButton = driver.find_element(by=By.XPATH, value='/html/body/div/form/input[4]')
    loginButton.click()

    # wait for the page that follows the login form; the PIN field is looked up on it
    # (This Xpath may need to change based on different seed url)
    WebDriverWait(driver, timeout).until(EC.visibility_of_element_located(
        (By.XPATH, '/html/body/div/center')))

    pinBox = driver.find_element(by=By.XPATH, value='//*[@id="pin"]')
    pinBox.send_keys(pin)
    submit = driver.find_element(by=By.XPATH, value='/html/body/div/form/input[2]')
    submit.click()

    # block until the page reached after submitting the PIN is visible
    WebDriverWait(driver, timeout).until(EC.visibility_of_element_located(
        (By.XPATH, '/html/body/div[1]/p')))
    # driver.find_element(by=By.XPATH, value='/html/body/div[1]/label').click()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Saves the crawled html page, makes the directory path for html pages if not made |
|
|
# Saves the crawled html page, makes the directory path for html pages if not made |
|
@ -210,33 +237,30 @@ def crawlForum(driver): |
|
|
driver.get(link) |
|
|
driver.get(link) |
|
|
except: |
|
|
except: |
|
|
driver.refresh() |
|
|
driver.refresh() |
|
|
|
|
|
WebDriverWait(driver, 100).until(EC.visibility_of_element_located( |
|
|
|
|
|
(By.XPATH, '/html/body/div[3]/div/table/tbody/tr/td[2]/center/table/tbody/tr[1]/td[1]'))) |
|
|
html = driver.page_source |
|
|
html = driver.page_source |
|
|
savePage(driver, html, link) |
|
|
savePage(driver, html, link) |
|
|
|
|
|
|
|
|
list = productPages(html) |
|
|
list = productPages(html) |
|
|
|
|
|
|
|
|
for item in list: |
|
|
for item in list: |
|
|
itemURL = urlparse.urljoin(BASE_URL, str(item)) |
|
|
|
|
|
|
|
|
itemURL = urlparse.urljoin(getFixedURL(), str(item)) |
|
|
try: |
|
|
try: |
|
|
# time.sleep(1.5) # to keep from detecting click speed |
|
|
|
|
|
driver.get(itemURL) |
|
|
driver.get(itemURL) |
|
|
except: |
|
|
except: |
|
|
driver.refresh() |
|
|
driver.refresh() |
|
|
savePage(driver, driver.page_source, item) |
|
|
savePage(driver, driver.page_source, item) |
|
|
# time.sleep(1.5) |
|
|
|
|
|
driver.back() |
|
|
driver.back() |
|
|
# to keep from detecting click speed |
|
|
|
|
|
|
|
|
|
|
|
# comment out |
|
|
|
|
|
break |
|
|
|
|
|
|
|
|
|
|
|
# comment out |
|
|
|
|
|
if count == 1: |
|
|
|
|
|
break |
|
|
|
|
|
|
|
|
# # comment out |
|
|
|
|
|
# break |
|
|
|
|
|
# |
|
|
|
|
|
# # comment out |
|
|
|
|
|
# if count == 1: |
|
|
|
|
|
# break |
|
|
|
|
|
|
|
|
try: |
|
|
try: |
|
|
# nav = driver.find_element(by=By.XPATH, value='/html/body/table[1]/tbody/tr/td/form/div/div[2]/table[2]') |
|
|
|
|
|
# a = nav.find_element(by=By.LINK_TEXT, value=">") |
|
|
|
|
|
link = driver.find_element(by=By.LINK_TEXT, value=">").get_attribute('href') |
|
|
|
|
|
|
|
|
link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href') |
|
|
if link == "": |
|
|
if link == "": |
|
|
raise NoSuchElementException |
|
|
raise NoSuchElementException |
|
|
count += 1 |
|
|
count += 1 |
|
@ -246,7 +270,6 @@ def crawlForum(driver): |
|
|
|
|
|
|
|
|
except Exception as e: |
|
|
except Exception as e: |
|
|
print(link, e) |
|
|
print(link, e) |
|
|
# raise e |
|
|
|
|
|
i += 1 |
|
|
i += 1 |
|
|
|
|
|
|
|
|
input("Crawling Torzon market done sucessfully. Press ENTER to continue\n") |
|
|
input("Crawling Torzon market done sucessfully. Press ENTER to continue\n") |
|
|