
debugged forums

Branch: main
westernmeadow committed 1 year ago
parent commit baa6974be7
16 changed files with 149 additions and 129 deletions
 1. +0  -1   .idea/DW_Pipeline_Test.iml
 2. +13 -11  Forums/AbyssForum/crawler_selenium.py
 3. +7  -5   Forums/Altenens/crawler_selenium.py
 4. +8  -6   Forums/BestCardingWorld/crawler_selenium.py
 5. +26 -24  Forums/BestCardingWorld/parser.py
 6. +14 -12  Forums/Cardingleaks/crawler_selenium.py
 7. +12 -10  Forums/CryptBB/crawler_selenium.py
 8. +1  -1   Forums/CryptBB/parser.py
 9. +12 -10  Forums/HiddenAnswers/crawler_selenium.py
10. +5  -2   Forums/HiddenAnswers/parser.py
11. +1  -9   Forums/Initialization/forumsList.txt
12. +5  -0   Forums/Initialization/prepare_parser.py
13. +13 -11  Forums/Libre/crawler_selenium.py
14. +12 -10  Forums/OnniForums/crawler_selenium.py
15. +8  -7   Forums/OnniForums/parser.py
16. +12 -10  Forums/Procrax/crawler_selenium.py

+ 0  - 1   .idea/DW_Pipeline_Test.iml

@@ -22,7 +22,6 @@
     <option value="$MODULE_DIR$/MarketPlaces/TorBay" />
     <option value="$MODULE_DIR$/MarketPlaces/TorMarket" />
     <option value="$MODULE_DIR$/MarketPlaces/ViceCity" />
-    <option value="$MODULE_DIR$/Forums/AbyssForum" />
     <option value="$MODULE_DIR$/Forums/Altenens" />
     <option value="$MODULE_DIR$/Forums/Cardingleaks" />
     <option value="$MODULE_DIR$/Forums/HiddenAnswers" />


+ 13  - 11   Forums/AbyssForum/crawler_selenium.py

@@ -32,15 +32,15 @@ baseURL = 'http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion
 def startCrawling():
     # opentor()
     forumName = getForumName()
-    # driver = getAccess()
-    #
-    # if driver != 'down':
-    #     try:
-    #         login(driver)
-    #         crawlForum(driver)
-    #     except Exception as e:
-    #         print(driver.current_url, e)
-    #     closetor(driver)
+    driver = getAccess()
+
+    if driver != 'down':
+        try:
+            login(driver)
+            crawlForum(driver)
+        except Exception as e:
+            print(driver.current_url, e)
+        closetor(driver)

     new_parse(forumName, baseURL, True)

@@ -121,6 +121,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver

 def getAccess():

@@ -241,14 +243,14 @@ def crawlForum(driver):
                     driver.back()

                 # comment out
-                break
+                # break

                 # comment out
                 if count == 1:
                     break

             try:
-                link = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[2]/div[2]/div[2]/ul/li[9]/a').get_attribute('href')
+                link = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div[2]/div[2]/div[2]/ul/li[9]/a').get_attribute('href')
                 if link == "":
                     raise NoSuchElementException
                 count += 1


+ 7  - 5   Forums/Altenens/crawler_selenium.py

@@ -42,7 +42,7 @@ def startCrawling():
             print(driver.current_url, e)
         closetor(driver)

-    # new_parse(forumName, baseURL, True)
+    new_parse(forumName, baseURL, True)

 # Opens Tor Browser

@@ -118,8 +118,8 @@ def createFFDriver():
     ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
     ff_prof.set_preference("signon.rememberSignons", False)
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
-    ff_prof.set_preference("network.dns.disablePrefetch", True)
-    ff_prof.set_preference("network.http.sendRefererHeader", 0)
+    # ff_prof.set_preference("network.dns.disablePrefetch", True)
+    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
     ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)

@@ -136,6 +136,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver

@@ -253,7 +255,7 @@ def crawlForum(driver):
                     driver.back()

                 # comment out
-                break
+                # break

                 # comment out
                 if count == 1:

@@ -272,7 +274,7 @@ def crawlForum(driver):
                 print(link, e)
             i += 1

-    input("Crawling Altenens forum done successfully. Press ENTER to continue\n")
+    print("Crawling the Altenens forum done.")

 # Returns 'True' if the link is Topic link, may need to change for every website


+ 8  - 6   Forums/BestCardingWorld/crawler_selenium.py

@@ -114,6 +114,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver

@@ -238,8 +240,8 @@ def crawlForum(driver):
             try:
                 nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div[2]/div[4]/ul')
-                li = nav.find_element_by_class_name('next')
-                page = li.find_element_by_tag_name('a').get_attribute('href')
+                li = nav.find_element(by=By.CLASS_NAME, value='next')
+                page = li.find_element(by=By.TAG_NAME, value='a').get_attribute('href')
                 if page == "":
                     raise NoSuchElementException
                 counter += 1

@@ -252,7 +254,7 @@ def crawlForum(driver):
                     driver.back()

                 # comment out
-                break
+                # break

                 # comment out
                 if count == 1:

@@ -260,8 +262,8 @@ def crawlForum(driver):
             try:
                 bar = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div[2]/div[3]/ul')
-                next = bar.find_element_by_class_name('next')
-                link = next.find_element_by_tag_name('a').get_attribute('href')
+                next = bar.find_element(by=By.CLASS_NAME, value='next')
+                link = next.find_element(by=By.TAG_NAME, value='a').get_attribute('href')
                 if link == "":
                     raise NoSuchElementException
                 count += 1

@@ -273,7 +275,7 @@ def crawlForum(driver):
                 print(link, e)
             i += 1

-    input("Crawling BestCardingWorld forum done sucessfully. Press ENTER to continue\n")
+    print("Crawling the BestCardingWorld forum done.")

 # Returns 'True' if the link is a description link
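Note: the locator changes above track the Selenium 4 API, which dropped the find_element_by_* helpers in favor of find_element with a By constant. A minimal sketch of the new style, assuming a live driver on a phpBB-style page (next_page_link is a hypothetical name, not from the repo):

    from selenium.webdriver.common.by import By

    def next_page_link(driver):
        # Same traversal as the diff: nav bar -> li.next -> anchor href
        nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div[2]/div[4]/ul')
        li = nav.find_element(by=By.CLASS_NAME, value='next')
        return li.find_element(by=By.TAG_NAME, value='a').get_attribute('href')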


+ 26  - 24   Forums/BestCardingWorld/parser.py

@@ -152,7 +152,7 @@ def bestcardingworld_description_parser(soup):

     # Populate the final variable (this should be a list with all fields scraped)

-    row = (topic, post, user, addDate, feedback, status, reputation, sign, interest)
+    row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)

     # Sending the results

@@ -166,15 +166,17 @@ def bestcardingworld_description_parser(soup):
 #return: 'row' that contains a variety of lists that each hold info on the listing page
 def bestcardingworld_listing_parser(soup):

-    nm = 0  # this variable should receive the number of topics
-    topic = []  # 1 all topics
-    board = "-1"  # 2 board name (the previous level of the topic in the Forum categorization tree.
+    nm = 0  # *this variable should receive the number of topics
+    forum = "BestCardingWorld"  # 0 *forum name
+    board = "-1"  # 1 *board name (the previous level of the topic in the Forum categorization tree.
                   # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
-    view = []  # 3 number of views of each topic
-    post = []  # 4 number of posts of each topic
-    user = []  # 5 all users of each topic
-    addDate = []  # 6 when the topic was created (difficult to find)
-    href = []  # 16 this variable should receive all cleaned urls (we will use this to do the marge between Listing and Description pages)
+    author = []  # 2 *all authors of each topic
+    topic = []  # 3 *all topics
+    views = []  # 4 number of views of each topic
+    posts = []  # 5 number of posts of each topic
+    href = []  # 6 this variable should receive all cleaned urls (we will use this to do the marge between
+               # Listing and Description pages)
+    addDate = []  # 7 when the topic was created (difficult to find)

     # Finding the board (should be just one)

@@ -187,7 +189,12 @@ def bestcardingworld_listing_parser(soup):
     itopics = soup.find('ul', {"class": "topiclist topics"}).findAll('div',{"class": "list-inner"})
     replies = soup.find('ul', {"class": "topiclist topics"}).findAll('dd',{"class": "posts"})
-    views = soup.find('ul', {"class": "topiclist topics"}).findAll('dd',{"class": "views"})
+    view = soup.find('ul', {"class": "topiclist topics"}).findAll('dd',{"class": "views"})
+
+    # Counting how many topics we have found so far
+    nm = len(itopics)

     index = 0
     for itopic in itopics:

@@ -213,10 +220,6 @@ def bestcardingworld_listing_parser(soup):
         topics = itopic.find('a', {"class": "topictitle"}).text
         topic.append(cleanString(topics))

-        # Counting how many topics we have found so far
-        nm = len(topic)
-
         # Adding the url to the list of urls
         link = itopic.find('a', {"class": "topictitle"}).get('href')
         link = cleanLink(link)

@@ -224,18 +227,18 @@ def bestcardingworld_listing_parser(soup):

         # Finding the author of the topic
         ps = itopic.find('div', {"class":"responsive-hide"}).find('a', {"class": "username-coloured"}).text
-        author = ps.strip()
-        user.append(cleanString(author))
+        user = ps.strip()
+        author.append(cleanString(user))

         # Finding the number of replies
-        posts = replies[index].text.split()[0]
-        posts = posts.strip()
-        post.append(cleanString(posts))
+        post = replies[index].text.split()[0]
+        post = post.strip()
+        posts.append(cleanString(post))

         # Finding the number of Views
-        tview = views[index].text.split()[0]
+        tview = view[index].text.split()[0]
         tview = tview.strip()
-        view.append(cleanString(tview))
+        views.append(cleanString(tview))

         # If no information about when the topic was added, just assign "-1" to the variable
         #CryptBB doesn't show when topic was first posted on listing page

@@ -245,10 +248,9 @@ def bestcardingworld_listing_parser(soup):
             addDate.append(date_time_obj)
             #addDate.append("-1")

         index += 1

-    return organizeTopics("BestCardingWorld", nm, topic, board, view, post, user, addDate, href)
+    return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate)

 #called by the crawler to get description links on a listing page
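Note: both changes in this file are field-order fixes — the description row tuple and the organizeTopics call were passing values in positions the consumer did not expect. A named structure would make such mix-ups impossible; a hedged sketch (Row is hypothetical, mirroring the corrected nine-field description row, with placeholder values):

    from collections import namedtuple

    # Hypothetical named row; fields follow the corrected tuple order in the diff.
    Row = namedtuple('Row', ['topic', 'user', 'status', 'reputation',
                             'interest', 'sign', 'post', 'feedback', 'addDate'])

    row = Row(topic='Example topic', user='alice', status='-1', reputation='-1',
              interest='-1', sign='-1', post=['first post'], feedback='-1',
              addDate='-1')
    print(row.topic, row.user)  # fields are read by name, not by position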


+ 14  - 12   Forums/Cardingleaks/crawler_selenium.py

@@ -34,15 +34,15 @@ baseURL = 'https://leaks.ws/'
 def startCrawling():
     # opentor()
     forumName = getForumName()
-    # driver = getAccess()
-    #
-    # if driver != 'down':
-    #     try:
-    #         login(driver)
-    #         crawlForum(driver)
-    #     except Exception as e:
-    #         print(driver.current_url, e)
-    #     closetor(driver)
+    driver = getAccess()
+
+    if driver != 'down':
+        try:
+            login(driver)
+            crawlForum(driver)
+        except Exception as e:
+            print(driver.current_url, e)
+        closetor(driver)

     new_parse(forumName, baseURL, True)

@@ -144,6 +144,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver

@@ -159,7 +161,7 @@ def getAccess():

 # Saves the crawled html page
-def savePage(page, url):
+def savePage(driver, page, url):
     cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)

@@ -242,7 +244,7 @@ def crawlForum(driver):
                         driver.get(itemURL)
                     except:
                         driver.refresh()
-                    savePage(driver.page_source, topic + f"page{counter}")  # very important
+                    savePage(driver, driver.page_source, topic + f"page{counter}")  # very important

                     # comment out
                     if counter == 2:

@@ -261,7 +263,7 @@ def crawlForum(driver):
                     driver.back()

                 # comment out
-                break
+                # break

                 # comment out
                 if count == 1:
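Note: savePage now takes the driver explicitly rather than relying on one in enclosing scope, so cleanHTML always sees the driver that actually fetched the page. A runnable sketch of the pattern with stand-in helpers — clean_html, path_for, and the file-writing tail are assumptions here, not the module's real bodies:

    import os

    def clean_html(driver, page):      # stand-in for the module's cleanHTML
        return page

    def path_for(url):                 # stand-in for the module's getFullPathName
        return os.path.join('/tmp/crawls', url.replace('/', '_') + '.html')

    def savePage(driver, page, url):
        cleanPage = clean_html(driver, page)   # driver is passed in, not global
        filePath = path_for(url)
        os.makedirs(os.path.dirname(filePath), exist_ok=True)
        with open(filePath, 'w') as f:
            f.write(cleanPage)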


+ 12  - 10   Forums/CryptBB/crawler_selenium.py

@@ -30,15 +30,15 @@ baseURL = 'http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion
 def startCrawling():
     # opentor()
     forumName = getForumName()
-    # driver = getAccess()
-    #
-    # if driver != 'down':
-    #     try:
-    #         login(driver)
-    #         crawlForum(driver)
-    #     except Exception as e:
-    #         print(driver.current_url, e)
-    #     closetor(driver)
+    driver = getAccess()
+
+    if driver != 'down':
+        try:
+            login(driver)
+            crawlForum(driver)
+        except Exception as e:
+            print(driver.current_url, e)
+        closetor(driver)

     new_parse(forumName, baseURL, True)

@@ -162,6 +162,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver

@@ -289,7 +291,7 @@ def crawlForum(driver):
                     driver.back()

                 # comment out
-                break
+                # break

                 # comment out
                 if count == 1:


+ 1  - 1   Forums/CryptBB/parser.py

@@ -124,7 +124,7 @@ def cryptBB_description_parser(soup):
                     stime = dt.replace('Yesterday,','').strip()
                     date_time_obj = yesterday+ ', '+stime
                     date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
-                elif "hours ago" in dt:
+                elif "hour ago" in dt or "hours ago" in dt:
                     day = day.strftime('%m-%d-%Y')
                     date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title']
                     date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p')
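Note: the one-line fix makes the branch match the singular form too — "1 hour ago" contains "hour ago" but not "hours ago", so it previously fell through to the wrong case. A self-contained sketch of that matching (parse_post_date is a hypothetical name; the real branch reads the absolute timestamp out of the span's title attribute instead of computing an offset):

    from datetime import datetime, timedelta

    def parse_post_date(dt: str, now: datetime) -> datetime:
        if "hour ago" in dt or "hours ago" in dt:   # matches "1 hour ago" as well
            return now - timedelta(hours=int(dt.split()[0]))
        return datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')

    now = datetime(2023, 7, 1, 12, 0)
    print(parse_post_date("1 hour ago", now))    # 2023-07-01 11:00:00
    print(parse_post_date("3 hours ago", now))   # 2023-07-01 09:00:00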


+ 12  - 10   Forums/HiddenAnswers/crawler_selenium.py

@@ -32,15 +32,15 @@ baseURL = 'http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion
 def startCrawling():
     # opentor()
     forumName = getForumName()
-    # driver: webdriver.Firefox = getAccess()
-    #
-    # if driver != 'down':
-    #     try:
-    #         login(driver)
-    #         crawlForum(driver)
-    #     except Exception as e:
-    #         print(driver.current_url, e)
-    #     closetor(driver)
+    driver: webdriver.Firefox = getAccess()
+
+    if driver != 'down':
+        try:
+            login(driver)
+            crawlForum(driver)
+        except Exception as e:
+            print(driver.current_url, e)
+        closetor(driver)

     new_parse(forumName, baseURL, True)

@@ -121,6 +121,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver

 def getAccess():

@@ -235,7 +237,7 @@ def crawlForum(driver: webdriver.Firefox):
                     driver.back()

                 # comment out
-                break
+                # break

                 # comment out
                 if count == 1:


+ 5  - 2   Forums/HiddenAnswers/parser.py

@@ -127,15 +127,18 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup):
         if date_posted.find("day") > 0:
             datetime_obj = datetime.now() - timedelta(days=1)
         else:
-            datetime_obj = datetime.strptime(f"{date_posted} {date.today().year}", "%b %d %Y")
+            try:
+                datetime_obj = datetime.strptime(f"{date_posted} {date.today().year}", "%b %d %Y")
+            except ValueError:
+                datetime_obj = datetime.strptime(f"{date_posted}", "%b %d, %Y")
         addDate.append(datetime_obj)

         #this link will be cleaned
         listing_href = queries.find("div", {"class": "qa-q-item-title"}).find("a").get("href")
         href.append(listing_href)

+    #need to change this method
     nm = len(topic)
     return organizeTopics(forum, nm, board, user, topic, view, post, href, addDate)

 #need to change this method
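Note: the listing dates here apparently come in two shapes — "Jul 5" for the current year and "Jul 5, 2022" with an explicit year — and the added try/except falls back to the second when the first format fails. A runnable sketch of that fallback (parse_listing_date is a hypothetical name):

    from datetime import datetime

    def parse_listing_date(date_posted: str, current_year: int) -> datetime:
        try:
            # "Jul 5" style: append the current year
            return datetime.strptime(f"{date_posted} {current_year}", "%b %d %Y")
        except ValueError:
            # "Jul 5, 2022" style: the string already carries a year
            return datetime.strptime(date_posted, "%b %d, %Y")

    print(parse_listing_date("Jul 5", 2023))        # 2023-07-05 00:00:00
    print(parse_listing_date("Jul 5, 2022", 2023))  # 2022-07-05 00:00:00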


+ 1  - 9   Forums/Initialization/forumsList.txt

@@ -1,9 +1 @@
-AbyssForum
-Altenens
-BestCardingWorld
-Cardingleaks
-CryptBB
-HiddenAnswers
-Libre
-OnniForums
-Procrax
+BestCardingWorld

+ 5  - 0   Forums/Initialization/prepare_parser.py

@@ -12,6 +12,7 @@ from Forums.OnniForums.parser import *
 from Forums.Altenens.parser import *
 from Forums.Procrax.parser import *
 from Forums.Libre.parser import *
+from Forums.HiddenAnswers.parser import *

 from Forums.Classifier.classify_product import predict
 # from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi

@@ -126,6 +127,8 @@ def parse_listing(forum, listingFile, soup, createLog, logFile):
         rw = procrax_listing_parser(soup)
     elif forum == "Libre":
         rw = libre_listing_parser(soup)
+    elif forum == "HiddenAnswers":
+        rw = HiddenAnswers_listing_parser(soup)
     else:
         print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
         raise Exception

@@ -160,6 +163,8 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile):
         rmm = procrax_description_parser(soup)
     elif forum == "Libre":
         rmm = libre_description_parser(soup)
+    elif forum == "HiddenAnswers":
+        rmm = HiddenAnswers_description_parser(soup)
     else:
         print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
         raise Exception
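Note: every new forum grows both elif chains here. A dict-based dispatch would keep the wiring in one place; a sketch under the same star-imports this file already pulls in — the mapping is hypothetical, and the real parse_listing also takes listingFile/createLog/logFile parameters that are omitted for brevity:

    # Hypothetical mapping; values are parser functions star-imported above.
    listing_parsers = {
        "BestCardingWorld": bestcardingworld_listing_parser,
        "Libre": libre_listing_parser,
        "Procrax": procrax_listing_parser,
        "HiddenAnswers": HiddenAnswers_listing_parser,
    }

    def parse_listing(forum, soup):
        parser = listing_parsers.get(forum)
        if parser is None:
            print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
            raise Exception
        return parser(soup)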


+ 13  - 11   Forums/Libre/crawler_selenium.py

@@ -30,15 +30,15 @@ baseURL = 'http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion
 def startCrawling():
     # opentor()
     forumName = getForumName()
-    # driver = getAccess()
-    #
-    # if driver != 'down':
-    #     try:
-    #         login(driver)
-    #         crawlForum(driver)
-    #     except Exception as e:
-    #         print(driver.current_url, e)
-    #     closetor(driver)
+    driver = getAccess()
+
+    if driver != 'down':
+        try:
+            login(driver)
+            crawlForum(driver)
+        except Exception as e:
+            print(driver.current_url, e)
+        closetor(driver)

     new_parse(forumName, baseURL, True)

@@ -144,6 +144,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver

@@ -255,7 +257,7 @@ def crawlForum(driver):
                     driver.back()

                 # comment out
-                break
+                # break

                 # comment out
                 if count == 1:

@@ -275,7 +277,7 @@ def crawlForum(driver):
                 print(link, e)
             i += 1

-    input("Crawling the Libre forum done.")
+    print("Crawling the Libre forum done.")

 # Returns 'True' if the link is Topic link, may need to change for every website


+ 12  - 10   Forums/OnniForums/crawler_selenium.py

@@ -33,15 +33,15 @@ baseURL = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion
 def startCrawling():
     # opentor()
     forumName = getForumName()
-    # driver = getAccess()
-    #
-    # if driver != 'down':
-    #     try:
-    #         login(driver)
-    #         crawlForum(driver)
-    #     except Exception as e:
-    #         print(driver.current_url, e)
-    #     closetor(driver)
+    driver = getAccess()
+
+    if driver != 'down':
+        try:
+            login(driver)
+            crawlForum(driver)
+        except Exception as e:
+            print(driver.current_url, e)
+        closetor(driver)

     new_parse(forum=forumName, url=baseURL, createLog=True)

@@ -139,6 +139,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver

@@ -267,7 +269,7 @@ def crawlForum(driver):
                     driver.back()

                 # comment out
-                break
+                # break

                 # comment out
                 if count == 1:


+ 8  - 7   Forums/OnniForums/parser.py

@@ -139,12 +139,14 @@ def onniForums_listing_parser(soup: BeautifulSoup):
     nm = len(thread_arrays)

     for thread in thread_arrays:  #getting the information from the posts and sorting them into the arrays defined above

-        try:
-            post_subject: str = thread.find("span",{"class": "subject_new"}).text  #getting the topic
+        body = thread.find("span",{"class": "subject_new"})
+        try:
+            post_subject: str = body.text  #getting the topic
         except AttributeError:
-            post_subject: str = thread.find("span",{"class": "subject_old"}).text
+            body = thread.find("span",{"class": "subject_old"})
+            post_subject: str = body.text

         post_subject_cleaned = cleanString(post_subject.strip())
         topic.append(post_subject_cleaned)

@@ -163,9 +165,8 @@ def onniForums_listing_parser(soup: BeautifulSoup):
         author = thread.find("span",{"class" : "author smalltext"}).text
         author_cleaned = cleanString(author.strip())
         user.append(author_cleaned)

-        reply_anchor = thread.find_all("td", {"align": "center"})[2].find('a')
-        thread_link = reply_anchor.get('href')
+        thread_link = body.find('a').get('href')
         href.append(thread_link)

     return organizeTopics(
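Note: keeping the matched span in body means the topic text and the thread link now come from the same element, whether the thread shows as unread (subject_new) or read (subject_old), instead of pulling the href from a replies cell. A standalone sketch of that fallback, using a None-check variant of the diff's try/except on hypothetical markup:

    from bs4 import BeautifulSoup

    html = '<div><span class="subject_old"><a href="/Thread-example">Example</a></span></div>'
    thread = BeautifulSoup(html, 'html.parser')

    body = thread.find("span", {"class": "subject_new"})
    if body is None:                                  # read threads lack subject_new
        body = thread.find("span", {"class": "subject_old"})

    print(body.text)                   # Example
    print(body.find('a').get('href'))  # /Thread-example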


+ 12  - 10   Forums/Procrax/crawler_selenium.py

@@ -33,15 +33,15 @@ FORUM_NAME = 'Procrax'
 # Opens Tor Browser, crawls the website
 def startCrawling():
     # opentor()
-    # driver = getAccess()
-    #
-    # if driver != 'down':
-    #     try:
-    #         login(driver)
-    #         crawlForum(driver)
-    #     except Exception as e:
-    #         print(driver.current_url, e)
-    #     closetor(driver)
+    driver = getAccess()
+
+    if driver != 'down':
+        try:
+            login(driver)
+            crawlForum(driver)
+        except Exception as e:
+            print(driver.current_url, e)
+        closetor(driver)

     new_parse(
         forum=FORUM_NAME,

@@ -140,6 +140,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver

 def getAccess():

@@ -257,7 +259,7 @@ def crawlForum(driver):
                     driver.back()

                 # comment out
-                break
+                # break

                 # comment out
                 if count == 1:
