
Finished fully running the completed AbyssForum and Procrax crawlers and parsers

main
westernmeadow committed 1 year ago
commit a3ebc920bb
4 changed files with 91 additions and 51 deletions

  1. Forums/AbyssForum/crawler_selenium.py (+26, -23)
  2. Forums/Initialization/prepare_parser.py (+5, -0)
  3. Forums/Procrax/crawler_selenium.py (+43, -20)
  4. Forums/Procrax/parser.py (+17, -8)

Forums/AbyssForum/crawler_selenium.py (+26, -23)

@@ -158,19 +158,19 @@ def getInterestedLinks():
     links = []
     # Hacked Database
-    # links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=26')
+    links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=26')
     # Hire a Hacker
     links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=27')
-    # # Hacking Tools
-    # links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=28')
-    # # Carding Forums
-    # links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=30')
-    # # Social Media Hacking
-    # links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=32')
-    # # Hacking Tutorials
-    # links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=12')
-    # # Cracking Tutorials
-    # links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=13')
+    # Hacking Tools
+    links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=28')
+    # Carding Forums
+    links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=30')
+    # Social Media Hacking
+    links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=32')
+    # Hacking Tutorials
+    links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=12')
+    # Cracking Tutorials
+    links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=13')
     return links
@@ -210,13 +210,14 @@ def crawlForum(driver):
                             driver.refresh()
                         savePage(driver, driver.page_source, topic + f"page{counter}")
-                        # comment out
-                        if counter == 2:
-                            break
+                        # # comment out
+                        # if counter == 2:
+                        # break
                         try:
-                            temp = driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[2]/div[3]')
-                            page = temp.find_element(by=By.CLASS_NAME, value='button button-icon-only').get_attribute('href')
+                            temp = driver.find_element(By.CLASS_NAME, 'pagination')
+                            temp = temp.find_element(by=By.CLASS_NAME, value='next')
+                            page = temp.find_element(by=By.CLASS_NAME, value='button').get_attribute('href')
                             if page == "":
                                 raise NoSuchElementException
                             counter += 1
@@ -228,15 +229,17 @@ def crawlForum(driver):
                     for j in range(counter):
                         driver.back()
-                    # comment out
-                    # break
-                # comment out
-                if count == 1:
-                    break
+                    # # comment out
+                    # break
+                    #
+                # # comment out
+                # if count == 1:
+                # break
                 try:
-                    link = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div[2]/div[2]/div[2]/ul/li[9]/a').get_attribute('href')
+                    temp = driver.find_element(By.CLASS_NAME, 'pagination')
+                    temp = temp.find_element(by=By.CLASS_NAME, value='next')
+                    link = temp.find_element(by=By.CLASS_NAME, value='button').get_attribute('href')
                     if link == "":
                         raise NoSuchElementException
                     count += 1
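Note: the two hunks above replace brittle absolute XPaths with class-based lookups (pagination, then next, then button, class names taken from the diff). A minimal standalone sketch of that lookup; next_page_href is a hypothetical helper name, not part of this repo:

from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

def next_page_href(driver):
    # Returns the href of the "next page" button, or None on the last page.
    try:
        pagination = driver.find_element(By.CLASS_NAME, 'pagination')
        next_item = pagination.find_element(By.CLASS_NAME, 'next')
        href = next_item.find_element(By.CLASS_NAME, 'button').get_attribute('href')
        return href or None   # an empty href is treated like a missing link
    except NoSuchElementException:
        return None

Chaining single-class lookups also avoids the removed 'button button-icon-only' value, since compound class names generally do not work with By.CLASS_NAME.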


Forums/Initialization/prepare_parser.py (+5, -0)

@@ -15,6 +15,7 @@ from Forums.Altenens.parser import *
 from Forums.Procrax.parser import *
 from Forums.Libre.parser import *
 from Forums.HiddenAnswers.parser import *
+from Forums.AbyssForum.parser import *
 from Forums.Classifier.classify_product import predict
 # from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi
@@ -137,6 +138,8 @@ def parse_listing(forum, listingFile, soup, createLog, logFile):
         rw = libre_listing_parser(soup)
     elif forum == "HiddenAnswers":
         rw = HiddenAnswers_listing_parser(soup)
+    elif forum == "AbyssForum":
+        rw = abyssForums_listing_parser(soup)
     else:
         print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
         raise Exception
@@ -174,6 +177,8 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile):
         rmm = libre_description_parser(soup)
     elif forum == "HiddenAnswers":
         rmm = HiddenAnswers_description_parser(soup)
+    elif forum == "AbyssForum":
+        rmm = abyssForums_description_parser(soup)
     else:
         print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
         raise Exception
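Note: prepare_parser.py routes each saved page to a forum-specific parser by string-matching the forum name; this commit wires AbyssForum into both the listing and description chains. For illustration only, an equivalent table-driven form of the same dispatch (not the repo's code; parse_listing_table is a hypothetical name):

from Forums.AbyssForum.parser import abyssForums_listing_parser
from Forums.HiddenAnswers.parser import HiddenAnswers_listing_parser
from Forums.Procrax.parser import procrax_listing_parser

LISTING_PARSERS = {
    "HiddenAnswers": HiddenAnswers_listing_parser,
    "AbyssForum": abyssForums_listing_parser,   # added by this commit
    "Procrax": procrax_listing_parser,
}

def parse_listing_table(forum, soup):
    # Mirrors the elif chain: an unwired forum still fails loudly.
    if forum not in LISTING_PARSERS:
        print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
        raise Exception
    return LISTING_PARSERS[forum](soup)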


Forums/Procrax/crawler_selenium.py (+43, -20)

@@ -171,19 +171,42 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []
-    # # general hacking
+    # verified sales
+    links.append('https://procrax.cx/forums/verified-sales-market.10/')
+    # unverified sales
+    links.append('https://procrax.cx/forums/unverified-sales-market.12/')
+    # combos
+    links.append('https://procrax.cx/forums/bases.79/')
+    # tools
+    links.append('https://procrax.cx/forums/tools.81/')
+    # configs
+    links.append('https://procrax.cx/forums/configs.82/')
+    # craxtube
+    links.append('https://procrax.cx/forums/craxtube.83/')
     # general hacking
     links.append('https://procrax.cx/forums/general-hacking.24/')
-    # # hacking security tools
-    # links.append('https://procrax.cx/forums/hacking-security-tools.20/')
-    # # hacktube
-    # links.append('https://procrax.cx/forums/hacktube.22/')
-    # # cardable
-    # links.append('https://procrax.cx/forums/cardable-websites.28/')
-    # # tools
-    # links.append('https://procrax.cx/forums/tools-bots-validators.73/')
+    # hacking security tools
+    links.append('https://procrax.cx/forums/hacking-security-tools.20/')
+    # hacktube
+    links.append('https://procrax.cx/forums/hacktube.22/')
+    # cardingtube
+    links.append('https://procrax.cx/forums/cardingtube.26/')
+    # cardable
+    links.append('https://procrax.cx/forums/cardable-websites.28/')
+    # spam software
+    links.append('https://procrax.cx/forums/mailing.72/')
+    # spam tools
+    links.append('https://procrax.cx/forums/tools-bots-validators.73/')
+    # darknet news
+    links.append('https://procrax.cx/forums/darknet-news-articles.42/')
+    # links
+    links.append('https://procrax.cx/forums/darknet-markets-deep-onion-links.43/')
+    # courses
+    links.append('https://procrax.cx/forums/courses.59/')
+    # software
+    links.append('https://procrax.cx/forums/software.76/')
     # general forum
-    # links.append('https://procrax.cx/forums/forum-discussions-updates.7/')
+    links.append('https://procrax.cx/forums/forum-discussions-updates.7/')
     return links
@@ -223,9 +246,9 @@ def crawlForum(driver):
                             driver.refresh()
                         savePage(driver, driver.page_source, topic + f"page{counter}") # very important
-                        # comment out
-                        if counter == 2:
-                            break
+                        # # comment out
+                        # if counter == 2:
+                        # break
                         try:
                             page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href')
@@ -240,12 +263,12 @@ def crawlForum(driver):
                     for j in range(counter):
                         driver.back()
-                    # comment out
-                    # break
-                # comment out
-                if count == 1:
-                    break
+                    # # comment out
+                    # break
+                    #
+                # # comment out
+                # if count == 1:
+                # break
                 try:
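Note: with the test-only breaks commented out, the Procrax crawler now walks every topic and listing page instead of stopping after the first ones; topic pagination keeps using XenForo's literal "Next" link (the unchanged context line in the hunk above). A minimal sketch of that step; follow_next is a hypothetical helper name, not part of this repo:

from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

def follow_next(driver):
    # Returns the URL of the next page, or None once the "Next" link disappears.
    try:
        href = driver.find_element(By.LINK_TEXT, 'Next').get_attribute('href')
        return href or None   # mirrors the crawler's `if page == "": raise NoSuchElementException` guard
    except NoSuchElementException:
        return None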


Forums/Procrax/parser.py (+17, -8)

@@ -48,8 +48,12 @@ def procrax_description_parser(soup: Tag):
         user_status = ipost.find("h5", {"class": "userTitle message-userTitle"}).text
         status.append(cleanString(user_status.strip()))
-        user_lvl = ipost.find("div", {"class": "afAwardLevel"}).text
-        reputation.append(cleanString(user_lvl.strip()))
+        user_lvl = ipost.find("div", {"class": "afAwardLevel"})
+        if user_lvl is not None:
+            user_lvl = user_lvl.text
+            reputation.append(cleanString(user_lvl.strip()))
+        else:
+            reputation.append('-1')
         sign.append("-1")
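Note: the hunk above guards against posts that carry no afAwardLevel badge, since BeautifulSoup's find() returns None for a missing element; '-1' is the placeholder the parsers use for missing data. A self-contained illustration (the HTML snippet is made up, and the repo's cleanString helper is omitted):

from bs4 import BeautifulSoup

post_html = '<article><h5 class="userTitle message-userTitle">Member</h5></article>'  # no afAwardLevel div
ipost = BeautifulSoup(post_html, 'html.parser')

user_lvl = ipost.find("div", {"class": "afAwardLevel"})
reputation = user_lvl.text.strip() if user_lvl is not None else '-1'
print(reputation)   # -> -1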
@@ -110,7 +114,11 @@ def procrax_listing_parser(soup: Tag):
     li = soup.find("h1", {"class": "p-title-value"})
     board = cleanString(li.text.strip())
-    threads_list: ResultSet[Tag] = soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True})
+    threads_list = soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True})
+    sticky = soup.find("div", {"class": "structItemContainer-group structItemContainer-group--sticky"})
+    if sticky is not None:
+        threads_list = sticky.find_all("div", {"data-author": True}) + threads_list
     nm = len(threads_list)
@@ -134,19 +142,20 @@ def procrax_listing_parser(soup: Tag):
         author.append(cleanString(thread_author))
         thread_views = thread.find("dl", {"class": "pairs pairs--justified structItem-minor"}).find('dd').text
-        thread_views = thread_views.lower().replace("k","000")
-        views.append(cleanString(thread_views.strip()))
+        thread_views = thread_views.lower().replace("k", "000")
+        thread_views = thread_views.lower().replace("m", "000000")
+        views.append(thread_views.strip())
         thread_replies = thread.find("dl", {"class": "pairs pairs--justified"}).find('dd').text
         # All threads contain one topic post and reply posts
-        thread_total_posts = str(1 + int(thread_replies))
-        posts.append(thread_total_posts)
+        thread_total_posts = thread_replies.lower().replace("k", "000")
+        posts.append(thread_total_posts.strip())
         thread_date = thread.find("li", {"class": "structItem-startDate"}).find("time").get("datetime")
         datetime_obj = datetime.strptime(thread_date, "%Y-%m-%dT%H:%M:%S%z")
         addDate.append(datetime_obj)
-        thread_link: str = thread.find("div", {"class": "structItem-title"}).find('a').get('href')
+        thread_link: str = thread.find("div", {"class": "structItem-title"}).find('a', {'class': ''}).get('href')
         href.append(thread_link)
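Note: XenForo listings abbreviate view and reply counts (e.g. "1.2K"), so the listing parser now lowercases the text and expands the k/m suffixes as plain string replacements before storing it. A small illustration of what that normalization does (example values are made up; normalize_count is a hypothetical name):

def normalize_count(text: str) -> str:
    text = text.lower().strip()
    text = text.replace("k", "000")      # "1.2k" -> "1.2000" (string-level expansion, as in the diff)
    return text.replace("m", "000000")

print(normalize_count("1.2K"))   # 1.2000
print(normalize_count("87"))     # 87

The tightened find('a', {'class': ''}) selector on the thread title presumably keeps only the bare title anchor and skips decorated anchors such as prefix labels.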

