diff --git a/.idea/DW_Pipeline_Test.iml b/.idea/DW_Pipeline_Test.iml
index 6363711..ba22e8a 100644
--- a/.idea/DW_Pipeline_Test.iml
+++ b/.idea/DW_Pipeline_Test.iml
@@ -22,7 +22,6 @@
-
diff --git a/Forums/AbyssForum/crawler_selenium.py b/Forums/AbyssForum/crawler_selenium.py
index 129e6dc..4823db2 100644
--- a/Forums/AbyssForum/crawler_selenium.py
+++ b/Forums/AbyssForum/crawler_selenium.py
@@ -32,15 +32,15 @@ baseURL = 'http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion
def startCrawling():
# opentor()
forumName = getForumName()
- # driver = getAccess()
- #
- # if driver != 'down':
- # try:
- # login(driver)
- # crawlForum(driver)
- # except Exception as e:
- # print(driver.current_url, e)
- # closetor(driver)
+ driver = getAccess()
+
+ if driver != 'down':
+ try:
+ login(driver)
+ crawlForum(driver)
+ except Exception as e:
+ print(driver.current_url, e)
+ closetor(driver)
new_parse(forumName, baseURL, True)
@@ -121,6 +121,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+ driver.maximize_window()
+
return driver
def getAccess():
@@ -241,14 +243,14 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
break
try:
- link = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[2]/div[2]/div[2]/ul/li[9]/a').get_attribute('href')
+ link = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div[2]/div[2]/div[2]/ul/li[9]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
diff --git a/Forums/Altenens/crawler_selenium.py b/Forums/Altenens/crawler_selenium.py
index 0f14223..e996150 100644
--- a/Forums/Altenens/crawler_selenium.py
+++ b/Forums/Altenens/crawler_selenium.py
@@ -42,7 +42,7 @@ def startCrawling():
print(driver.current_url, e)
closetor(driver)
- # new_parse(forumName, baseURL, True)
+ new_parse(forumName, baseURL, True)
# Opens Tor Browser
@@ -118,8 +118,8 @@ def createFFDriver():
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
- ff_prof.set_preference("network.dns.disablePrefetch", True)
- ff_prof.set_preference("network.http.sendRefererHeader", 0)
+ # ff_prof.set_preference("network.dns.disablePrefetch", True)
+ # ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@@ -136,6 +136,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+ driver.maximize_window()
+
return driver
@@ -253,7 +255,7 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
@@ -272,7 +274,7 @@ def crawlForum(driver):
print(link, e)
i += 1
- input("Crawling Altenens forum done successfully. Press ENTER to continue\n")
+ print("Crawling the Altenens forum done.")
# Returns 'True' if the link is Topic link, may need to change for every website
diff --git a/Forums/BestCardingWorld/crawler_selenium.py b/Forums/BestCardingWorld/crawler_selenium.py
index 96821cd..22f6077 100644
--- a/Forums/BestCardingWorld/crawler_selenium.py
+++ b/Forums/BestCardingWorld/crawler_selenium.py
@@ -114,6 +114,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+ driver.maximize_window()
+
return driver
@@ -238,8 +240,8 @@ def crawlForum(driver):
try:
nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div[2]/div[4]/ul')
- li = nav.find_element_by_class_name('next')
- page = li.find_element_by_tag_name('a').get_attribute('href')
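+ # newer Selenium releases drop the find_element_by_* helpers, hence find_element(By..., ...)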
+ li = nav.find_element(by=By.CLASS_NAME, value='next')
+ page = li.find_element(by=By.TAG_NAME, value='a').get_attribute('href')
if page == "":
raise NoSuchElementException
counter += 1
@@ -252,7 +254,7 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
@@ -260,8 +262,8 @@ def crawlForum(driver):
try:
bar = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div[2]/div[3]/ul')
- next = bar.find_element_by_class_name('next')
- link = next.find_element_by_tag_name('a').get_attribute('href')
+ next_nav = bar.find_element(by=By.CLASS_NAME, value='next')  # renamed to avoid shadowing the built-in next()
+ link = next_nav.find_element(by=By.TAG_NAME, value='a').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
@@ -273,7 +275,7 @@ def crawlForum(driver):
print(link, e)
i += 1
- input("Crawling BestCardingWorld forum done sucessfully. Press ENTER to continue\n")
+ print("Crawling the BestCardingWorld forum done.")
# Returns 'True' if the link is a description link
diff --git a/Forums/BestCardingWorld/parser.py b/Forums/BestCardingWorld/parser.py
index 540435a..c4ca6e0 100644
--- a/Forums/BestCardingWorld/parser.py
+++ b/Forums/BestCardingWorld/parser.py
@@ -152,7 +152,7 @@ def bestcardingworld_description_parser(soup):
# Populate the final variable (this should be a list with all fields scraped)
- row = (topic, post, user, addDate, feedback, status, reputation, sign, interest)
+ row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
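+ # note: fields reordered, presumably to line up with the shared row layout the other forum parsers emit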
# Sending the results
@@ -166,15 +166,17 @@ def bestcardingworld_description_parser(soup):
#return: 'row' that contains a variety of lists that each hold info on the listing page
def bestcardingworld_listing_parser(soup):
- nm = 0 # this variable should receive the number of topics
- topic = [] # 1 all topics
- board = "-1" # 2 board name (the previous level of the topic in the Forum categorization tree.
+ nm = 0 # *this variable should receive the number of topics
+ forum = "BestCardingWorld" # 0 *forum name
+ board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
- view = [] # 3 number of views of each topic
- post = [] # 4 number of posts of each topic
- user = [] # 5 all users of each topic
- addDate = [] # 6 when the topic was created (difficult to find)
- href = [] # 16 this variable should receive all cleaned urls (we will use this to do the marge between Listing and Description pages)
+ author = [] # 2 *all authors of each topic
+ topic = [] # 3 *all topics
+ views = [] # 4 number of views of each topic
+ posts = [] # 5 number of posts of each topic
+ href = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between
+ # Listing and Description pages)
+ addDate = [] # 7 when the topic was created (difficult to find)
# Finding the board (should be just one)
@@ -187,7 +189,12 @@ def bestcardingworld_listing_parser(soup):
itopics = soup.find('ul', {"class": "topiclist topics"}).findAll('div',{"class": "list-inner"})
replies = soup.find('ul', {"class": "topiclist topics"}).findAll('dd',{"class": "posts"})
- views = soup.find('ul', {"class": "topiclist topics"}).findAll('dd',{"class": "views"})
+ view = soup.find('ul', {"class": "topiclist topics"}).findAll('dd',{"class": "views"})
+
+ # Counting how many topics we have found so far
+
+ nm = len(itopics)
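+ # counting the containers here replaces the len(topic) recount that used to run inside the loop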
+
index = 0
for itopic in itopics:
@@ -213,10 +220,6 @@ def bestcardingworld_listing_parser(soup):
topics = itopic.find('a', {"class": "topictitle"}).text
topic.append(cleanString(topics))
- # Counting how many topics we have found so far
-
- nm = len(topic)
-
# Adding the url to the list of urls
link = itopic.find('a', {"class": "topictitle"}).get('href')
link = cleanLink(link)
@@ -224,18 +227,18 @@ def bestcardingworld_listing_parser(soup):
# Finding the author of the topic
ps = itopic.find('div', {"class":"responsive-hide"}).find('a', {"class": "username-coloured"}).text
- author = ps.strip()
- user.append(cleanString(author))
+ user = ps.strip()
+ author.append(cleanString(user))
# Finding the number of replies
- posts = replies[index].text.split()[0]
- posts = posts.strip()
- post.append(cleanString(posts))
+ post = replies[index].text.split()[0]
+ post = post.strip()
+ posts.append(cleanString(post))
# Finding the number of Views
- tview = views[index].text.split()[0]
+ tview = view[index].text.split()[0]
tview = tview.strip()
- view.append(cleanString(tview))
+ views.append(cleanString(tview))
# If no information about when the topic was added, just assign "-1" to the variable
#CryptBB doesn't show when topic was first posted on listing page
@@ -245,10 +248,9 @@ def bestcardingworld_listing_parser(soup):
addDate.append(date_time_obj)
#addDate.append("-1")
-
-
index += 1
- return organizeTopics("BestCardingWorld", nm, topic, board, view, post, user, addDate, href)
+
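+ # same argument order as the organizeTopics calls in the other forum parsers (e.g. HiddenAnswers)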
+ return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate)
#called by the crawler to get description links on a listing page
diff --git a/Forums/Cardingleaks/crawler_selenium.py b/Forums/Cardingleaks/crawler_selenium.py
index 85538fd..70200ff 100644
--- a/Forums/Cardingleaks/crawler_selenium.py
+++ b/Forums/Cardingleaks/crawler_selenium.py
@@ -34,15 +34,15 @@ baseURL = 'https://leaks.ws/'
def startCrawling():
# opentor()
forumName = getForumName()
- # driver = getAccess()
- #
- # if driver != 'down':
- # try:
- # login(driver)
- # crawlForum(driver)
- # except Exception as e:
- # print(driver.current_url, e)
- # closetor(driver)
+ driver = getAccess()
+
+ if driver != 'down':
+ try:
+ login(driver)
+ crawlForum(driver)
+ except Exception as e:
+ print(driver.current_url, e)
+ closetor(driver)
new_parse(forumName, baseURL, True)
@@ -144,6 +144,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+ driver.maximize_window()
+
return driver
@@ -159,7 +161,7 @@ def getAccess():
# Saves the crawled html page
-def savePage(page, url):
+def savePage(driver, page, url):
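+ # driver is now passed in explicitly rather than picked up from the enclosing scope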
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
@@ -242,7 +244,7 @@ def crawlForum(driver):
driver.get(itemURL)
except:
driver.refresh()
- savePage(driver.page_source, topic + f"page{counter}") # very important
+ savePage(driver, driver.page_source, topic + f"page{counter}") # very important
# comment out
if counter == 2:
@@ -261,7 +263,7 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
diff --git a/Forums/CryptBB/crawler_selenium.py b/Forums/CryptBB/crawler_selenium.py
index 5e98a7d..a462a65 100644
--- a/Forums/CryptBB/crawler_selenium.py
+++ b/Forums/CryptBB/crawler_selenium.py
@@ -30,15 +30,15 @@ baseURL = 'http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion
def startCrawling():
# opentor()
forumName = getForumName()
- # driver = getAccess()
- #
- # if driver != 'down':
- # try:
- # login(driver)
- # crawlForum(driver)
- # except Exception as e:
- # print(driver.current_url, e)
- # closetor(driver)
+ driver = getAccess()
+
+ if driver != 'down':
+ try:
+ login(driver)
+ crawlForum(driver)
+ except Exception as e:
+ print(driver.current_url, e)
+ closetor(driver)
new_parse(forumName, baseURL, True)
@@ -162,6 +162,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+ driver.maximize_window()
+
return driver
@@ -289,7 +291,7 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
diff --git a/Forums/CryptBB/parser.py b/Forums/CryptBB/parser.py
index 7fbd56d..bcef5f8 100644
--- a/Forums/CryptBB/parser.py
+++ b/Forums/CryptBB/parser.py
@@ -124,7 +124,7 @@ def cryptBB_description_parser(soup):
stime = dt.replace('Yesterday,','').strip()
date_time_obj = yesterday+ ', '+stime
date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
- elif "hours ago" in dt:
+ elif "hour ago" in dt or "hours ago" in dt:
day = day.strftime('%m-%d-%Y')
date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title']
date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p')
diff --git a/Forums/HiddenAnswers/crawler_selenium.py b/Forums/HiddenAnswers/crawler_selenium.py
index 6641b81..392c90f 100644
--- a/Forums/HiddenAnswers/crawler_selenium.py
+++ b/Forums/HiddenAnswers/crawler_selenium.py
@@ -32,15 +32,15 @@ baseURL = 'http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion
def startCrawling():
# opentor()
forumName = getForumName()
- # driver: webdriver.Firefox = getAccess()
- #
- # if driver != 'down':
- # try:
- # login(driver)
- # crawlForum(driver)
- # except Exception as e:
- # print(driver.current_url, e)
- # closetor(driver)
+ driver: webdriver.Firefox = getAccess()
+
+ if driver != 'down':
+ try:
+ login(driver)
+ crawlForum(driver)
+ except Exception as e:
+ print(driver.current_url, e)
+ closetor(driver)
new_parse(forumName, baseURL, True)
@@ -121,6 +121,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+ driver.maximize_window()
+
return driver
def getAccess():
@@ -235,7 +237,7 @@ def crawlForum(driver: webdriver.Firefox):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
diff --git a/Forums/HiddenAnswers/parser.py b/Forums/HiddenAnswers/parser.py
index 1a3ee2d..16b56cb 100644
--- a/Forums/HiddenAnswers/parser.py
+++ b/Forums/HiddenAnswers/parser.py
@@ -127,15 +127,18 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup):
if date_posted.find("day") > 0:
datetime_obj = datetime.now() - timedelta(days=1)
else:
- datetime_obj = datetime.strptime(f"{date_posted} {date.today().year}", "%b %d %Y")
+ try:
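+ # dates without a year (e.g. "Jun 25") get the current year appended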
+ datetime_obj = datetime.strptime(f"{date_posted} {date.today().year}", "%b %d %Y")
+ except ValueError:
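+ # fall back to dates that already include a year, e.g. "Jun 25, 2022"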
+ datetime_obj = datetime.strptime(date_posted, "%b %d, %Y")
addDate.append(datetime_obj)
#this link will be cleaned
listing_href = queries.find("div", {"class": "qa-q-item-title"}).find("a").get("href")
href.append(listing_href)
-#need to change this method
nm = len(topic)
+
return organizeTopics(forum, nm, board, user, topic, view, post, href, addDate)
#need to change this method
diff --git a/Forums/Initialization/forumsList.txt b/Forums/Initialization/forumsList.txt
index 3526771..efa9686 100644
--- a/Forums/Initialization/forumsList.txt
+++ b/Forums/Initialization/forumsList.txt
@@ -1,9 +1 @@
-AbyssForum
-Altenens
-BestCardingWorld
-Cardingleaks
-CryptBB
-HiddenAnswers
-Libre
-OnniForums
-Procrax
\ No newline at end of file
+BestCardingWorld
\ No newline at end of file
diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py
index 272cb44..1f089e6 100644
--- a/Forums/Initialization/prepare_parser.py
+++ b/Forums/Initialization/prepare_parser.py
@@ -12,6 +12,7 @@ from Forums.OnniForums.parser import *
from Forums.Altenens.parser import *
from Forums.Procrax.parser import *
from Forums.Libre.parser import *
+from Forums.HiddenAnswers.parser import *
from Forums.Classifier.classify_product import predict
# from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi
@@ -126,6 +127,8 @@ def parse_listing(forum, listingFile, soup, createLog, logFile):
rw = procrax_listing_parser(soup)
elif forum == "Libre":
rw = libre_listing_parser(soup)
+ elif forum == "HiddenAnswers":
+ rw = HiddenAnswers_listing_parser(soup)
else:
print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
raise Exception
@@ -160,6 +163,8 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile):
rmm = procrax_description_parser(soup)
elif forum == "Libre":
rmm = libre_description_parser(soup)
+ elif forum == "HiddenAnswers":
+ rmm = HiddenAnswers_description_parser(soup)
else:
print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
raise Exception
diff --git a/Forums/Libre/crawler_selenium.py b/Forums/Libre/crawler_selenium.py
index d06cd83..1033474 100644
--- a/Forums/Libre/crawler_selenium.py
+++ b/Forums/Libre/crawler_selenium.py
@@ -30,15 +30,15 @@ baseURL = 'http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion
def startCrawling():
# opentor()
forumName = getForumName()
- # driver = getAccess()
- #
- # if driver != 'down':
- # try:
- # login(driver)
- # crawlForum(driver)
- # except Exception as e:
- # print(driver.current_url, e)
- # closetor(driver)
+ driver = getAccess()
+
+ if driver != 'down':
+ try:
+ login(driver)
+ crawlForum(driver)
+ except Exception as e:
+ print(driver.current_url, e)
+ closetor(driver)
new_parse(forumName, baseURL, True)
@@ -144,6 +144,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+ driver.maximize_window()
+
return driver
@@ -255,7 +257,7 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
@@ -275,7 +277,7 @@ def crawlForum(driver):
print(link, e)
i += 1
- input("Crawling the Libre forum done.")
+ print("Crawling the Libre forum done.")
# Returns 'True' if the link is Topic link, may need to change for every website
diff --git a/Forums/OnniForums/crawler_selenium.py b/Forums/OnniForums/crawler_selenium.py
index 58b1313..0888d14 100644
--- a/Forums/OnniForums/crawler_selenium.py
+++ b/Forums/OnniForums/crawler_selenium.py
@@ -33,15 +33,15 @@ baseURL = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion
def startCrawling():
# opentor()
forumName = getForumName()
- # driver = getAccess()
- #
- # if driver != 'down':
- # try:
- # login(driver)
- # crawlForum(driver)
- # except Exception as e:
- # print(driver.current_url, e)
- # closetor(driver)
+ driver = getAccess()
+
+ if driver != 'down':
+ try:
+ login(driver)
+ crawlForum(driver)
+ except Exception as e:
+ print(driver.current_url, e)
+ closetor(driver)
new_parse(forum=forumName, url=baseURL, createLog=True)
@@ -139,6 +139,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+ driver.maximize_window()
+
return driver
@@ -267,7 +269,7 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
diff --git a/Forums/OnniForums/parser.py b/Forums/OnniForums/parser.py
index 393e6b3..3854141 100644
--- a/Forums/OnniForums/parser.py
+++ b/Forums/OnniForums/parser.py
@@ -139,12 +139,14 @@ def onniForums_listing_parser(soup: BeautifulSoup):
nm = len(thread_arrays)
for thread in thread_arrays: #getting the information from the posts and sorting them into the arrays defined above
-
- try:
- post_subject: str = thread.find("span",{"class": "subject_new"}).text #getting the topic
+
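+ # keep the subject container; its anchor supplies the topic link further down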
+ body = thread.find("span",{"class": "subject_new"})
+ try:
+ post_subject: str = body.text #getting the topic
except AttributeError:
- post_subject: str = thread.find("span",{"class": "subject_old"}).text
+ body = thread.find("span",{"class": "subject_old"})
+ post_subject: str = body.text
post_subject_cleaned = cleanString(post_subject.strip())
topic.append(post_subject_cleaned)
@@ -163,9 +165,8 @@ def onniForums_listing_parser(soup: BeautifulSoup):
author = thread.find("span",{"class" : "author smalltext"}).text
author_cleaned = cleanString(author.strip())
user.append(author_cleaned)
-
- reply_anchor = thread.find_all("td", {"align": "center"})[2].find('a')
- thread_link = reply_anchor.get('href')
+
+ thread_link = body.find('a').get('href')
href.append(thread_link)
return organizeTopics(
diff --git a/Forums/Procrax/crawler_selenium.py b/Forums/Procrax/crawler_selenium.py
index f2ed372..2a8be96 100644
--- a/Forums/Procrax/crawler_selenium.py
+++ b/Forums/Procrax/crawler_selenium.py
@@ -33,15 +33,15 @@ FORUM_NAME = 'Procrax'
# Opens Tor Browser, crawls the website
def startCrawling():
# opentor()
- # driver = getAccess()
- #
- # if driver != 'down':
- # try:
- # login(driver)
- # crawlForum(driver)
- # except Exception as e:
- # print(driver.current_url, e)
- # closetor(driver)
+ driver = getAccess()
+
+ if driver != 'down':
+ try:
+ login(driver)
+ crawlForum(driver)
+ except Exception as e:
+ print(driver.current_url, e)
+ closetor(driver)
new_parse(
forum=FORUM_NAME,
@@ -140,6 +140,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+ driver.maximize_window()
+
return driver
def getAccess():
@@ -257,7 +259,7 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1: