
cleaned terminal output

main · westernmeadow committed 1 year ago · commit 2022abecc8
14 changed files with 157 additions and 126 deletions
1. +13 -13  Forums/AbyssForum/crawler_selenium.py
2. +11 -11  Forums/Altenens/crawler_selenium.py
3. +3 -4  Forums/Altenens/parser.py
4. +1 -1  Forums/BestCardingWorld/crawler_selenium.py
5. +15 -15  Forums/Cardingleaks/crawler_selenium.py
6. +15 -17  Forums/CryptBB/crawler_selenium.py
7. +18 -18  Forums/HiddenAnswers/crawler_selenium.py
8. +9 -1  Forums/Initialization/forumsList.txt
9. +19 -6  Forums/Initialization/forums_mining.py
10. +25 -8  Forums/Initialization/prepare_parser.py
11. +3 -3  Forums/Libre/crawler_selenium.py
12. +2 -6  Forums/Libre/parser.py
13. +13 -13  Forums/OnniForums/crawler_selenium.py
14. +10 -10  Forums/Procrax/crawler_selenium.py

+13 -13  Forums/AbyssForum/crawler_selenium.py

@@ -30,19 +30,19 @@ baseURL = 'http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion
 # Opens Tor Browser, crawls the website
 def startCrawling():
-    opentor()
-    # forumName = getForumName()
-    driver = getAccess()
+    # opentor()
+    forumName = getForumName()
+    # driver = getAccess()
+    #
+    # if driver != 'down':
+    #     try:
+    #         login(driver)
+    #         crawlForum(driver)
+    #     except Exception as e:
+    #         print(driver.current_url, e)
+    #     closetor(driver)
-    if driver != 'down':
-        try:
-            login(driver)
-            crawlForum(driver)
-        except Exception as e:
-            print(driver.current_url, e)
-        closetor(driver)
-    # new_parse(forumName, baseURL, False)
+    new_parse(forumName, baseURL, True)

 # Opens Tor Browser

@@ -260,7 +260,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1

-    input("Crawling AbyssForum forum done sucessfully. Press ENTER to continue\n")
+    print("Crawling the AbyssForum forum done.")

 # Returns 'True' if the link is Topic link
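
Nearly every crawler touched by this commit follows the same pattern: the Selenium crawl phase (opentor/getAccess/login/crawlForum/closetor) is commented out and the parse step is enabled with logging turned on. Stitching the interleaved -/+ lines above back together, the new startCrawling() reduces to the sketch below; getForumName(), new_parse(), and baseURL come straight from the diff, and the crawl phase is kept as commented-out dead code for the next scraping pass.

    # Post-commit startCrawling(), reassembled from the hunks above
    # (a sketch, not the full file). Only the parse phase runs now.
    def startCrawling():
        # opentor()
        forumName = getForumName()
        # driver = getAccess()
        #
        # if driver != 'down':
        #     try:
        #         login(driver)
        #         crawlForum(driver)
        #     except Exception as e:
        #         print(driver.current_url, e)
        #     closetor(driver)

        new_parse(forumName, baseURL, True)  # createLog=True: parse with logging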


+11 -11  Forums/Altenens/crawler_selenium.py

@@ -30,19 +30,19 @@ baseURL = 'https://altenens.is/'
 # Opens Tor Browser, crawls the website
 def startCrawling():
-    opentor()
+    # opentor()
     forumName = getForumName()
-    driver = getAccess()
+    # driver = getAccess()
+    #
+    # if driver != 'down':
+    #     try:
+    #         login(driver)
+    #         crawlForum(driver)
+    #     except Exception as e:
+    #         print(driver.current_url, e)
+    #     closetor(driver)
-    if driver != 'down':
-        try:
-            login(driver)
-            crawlForum(driver)
-        except Exception as e:
-            print(driver.current_url, e)
-        closetor(driver)
-    new_parse(forumName, baseURL, False)
+    new_parse(forumName, baseURL, True)

 # Opens Tor Browser


+3 -4  Forums/Altenens/parser.py

@@ -9,9 +9,8 @@ import re
 # Here, we are importing BeautifulSoup to search through the HTML tree
 from bs4 import BeautifulSoup

-# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
 # This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
 def altenens_description_parser(soup):

     topic = "-1"  # 0 *topic name

@@ -58,7 +57,7 @@ def altenens_description_parser(soup):
             if inner is not None:
                 inner = inner.strip()
             else:
-                inner = "-1"
+                inner = ""  # cannot use -1 because the post is hidden unless you reply
             post.append(cleanString(inner))

             feedback.append("-1")

@@ -75,8 +74,8 @@ def altenens_description_parser(soup):
     return row

-# This is the method to parse the Listing Pages (one page with many posts)
 # This is the method to parse the Listing Pages (one page with many posts)
 def altenens_listing_parser(soup):
     nm = 0  # *this variable should receive the number of topics


+1 -1  Forums/BestCardingWorld/crawler_selenium.py

@@ -38,7 +38,7 @@ def startCrawling():
     #         print(driver.current_url, e)
     #     closetor(driver)

-    new_parse(forumName, False)
+    new_parse(forumName, baseURL, True)

 # Opens Tor Browser


+15 -15  Forums/Cardingleaks/crawler_selenium.py

@@ -4,7 +4,7 @@ __author__ = 'DarkWeb'
 Cardingleaks Forum Crawler (Selenium)
 Crawler updated and fixed

-The site has this thing sometime whereyou'll have to look at a new post everyday. makes sure
+The site has this thing sometime where you'll have to look at a new post everyday. makes sure
 you login first before crawling.
 '''

@@ -27,7 +27,7 @@ from Forums.Cardingleaks.parser import cardingleaks_links_parser
 from Forums.Utilities.utilities import cleanHTML

 counter = 1
-baseURL = 'https://cardingleaks.ws/'
+baseURL = 'https://leaks.ws/'

 # Opens Tor Browser, crawls the website

@@ -35,7 +35,7 @@ def startCrawling():
     # opentor()
     forumName = getForumName()
     # driver = getAccess()
-
+    #
     # if driver != 'down':
     #     try:
     #         login(driver)

@@ -44,7 +44,7 @@ def startCrawling():
     #         print(driver.current_url, e)
     #     closetor(driver)

-    new_parse(forumName, baseURL, False)
+    new_parse(forumName, baseURL, True)

 # Opens Tor Browser

@@ -96,7 +96,7 @@ def getForumName() -> str:
 # Return the link of the website
 def getFixedURL():
-    url = 'https://cardingleaks.ws/'
+    url = 'https://leaks.ws/'

     return url

@@ -194,23 +194,23 @@ def getInterestedLinks():
     links = []

     # # carding methods
-    links.append('https://cardingleaks.ws/forums/carding-methods.82/')
+    links.append('https://leaks.ws/forums/carding-methods.82/')
     # # carding schools
-    # links.append('https://cardingleaks.ws/forums/help-desk-carding-school.35/')
+    # links.append('https://leaks.ws/forums/help-desk-carding-school.35/')
     # # carding discussion
-    # links.append('https://cardingleaks.ws/forums/carding-discussion-desk.58/')
+    # links.append('https://leaks.ws/forums/carding-discussion-desk.58/')
     # # carding tutorials
-    # links.append('https://cardingleaks.ws/forums/carding-tutorials.13/')
+    # links.append('https://leaks.ws/forums/carding-tutorials.13/')
     # # carding tools and software
-    # links.append('https://cardingleaks.ws/forums/carding-tools-softwares.10/')
+    # links.append('https://leaks.ws/forums/carding-tools-softwares.10/')
     # # exploits and cracking tools
-    # links.append('https://cardingleaks.ws/forums/exploits-cracking-tools.22/')
+    # links.append('https://leaks.ws/forums/exploits-cracking-tools.22/')

     return links

 def crawlForum(driver):
-    print("Crawling the Cardinglinks forum")
+    print("Crawling the Cardingleaks forum")

     linksToCrawl = getInterestedLinks()

@@ -245,7 +245,7 @@ def crawlForum(driver):
                     savePage(driver.page_source, topic + f"page{counter}")  # very important

                     # comment out
-                    if counter == 5:
+                    if counter == 2:
                         break

                     try:

@@ -264,7 +264,7 @@ def crawlForum(driver):
                 break

             # comment out
-            if count == 10:
+            if count == 1:
                 break

             try:

@@ -280,7 +280,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1

-    input("Crawling Cardingleaksforum done successfully. Press ENTER to continue\n")
+    print("Crawling the Cardingleaks forum done.")

 # Returns 'True' if the link is Topic link, may need to change for every website


+15 -17  Forums/CryptBB/crawler_selenium.py

@@ -28,19 +28,19 @@ baseURL = 'http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion
 # Opens Tor Browser, crawls the website
 def startCrawling():
-    opentor()
+    # opentor()
     forumName = getForumName()
-    driver = getAccess()
-    if driver != 'down':
-        try:
-            login(driver)
-            crawlForum(driver)
-        except Exception as e:
-            print(driver.current_url, e)
-        closetor(driver)
+    # driver = getAccess()
+    #
+    # if driver != 'down':
+    #     try:
+    #         login(driver)
+    #         crawlForum(driver)
+    #     except Exception as e:
+    #         print(driver.current_url, e)
+    #     closetor(driver)

-    new_parse(forumName, baseURL, False)
+    new_parse(forumName, baseURL, True)

 # Opens Tor Browser

@@ -238,8 +238,6 @@ def getInterestedLinks():
 def crawlForum(driver):
-    print("Crawling the CryptBB forum")
-
     print("Crawling the CryptBB forum")
     linksToCrawl = getInterestedLinks()

     i = 0

@@ -273,7 +271,7 @@ def crawlForum(driver):
                     savePage(driver.page_source, topic + f"page{counter}")  # very important

                     # comment out
-                    if counter == 10:
+                    if counter == 2:
                         break

                     try:

@@ -291,10 +289,10 @@ def crawlForum(driver):
                 driver.back()

             # comment out
-            # break
+            break

             # comment out
-            if count == 20:
+            if count == 1:
                 break

             try:

@@ -312,7 +310,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1

-    input("Crawling CrypttBB done successfully. Press ENTER to continue\n")
+    print("Crawling the CrypttBB forum done.")

 # Returns 'True' if the link is Topic link, may need to change for every website


+18 -18  Forums/HiddenAnswers/crawler_selenium.py

@@ -30,19 +30,19 @@ baseURL = 'http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion
 # Opens Tor Browser, crawls the website
 def startCrawling():
-    opentor()
-    # forumName = getForumName()
-    driver: webdriver.Firefox = getAccess()
+    # opentor()
+    forumName = getForumName()
+    # driver: webdriver.Firefox = getAccess()
+    #
+    # if driver != 'down':
+    #     try:
+    #         login(driver)
+    #         crawlForum(driver)
+    #     except Exception as e:
+    #         print(driver.current_url, e)
+    #     closetor(driver)
-    if driver != 'down':
-        try:
-            login(driver)
-            crawlForum(driver)
-        except Exception as e:
-            print(driver.current_url, e)
-        closetor(driver)
-    # new_parse(forumName, baseURL, False)
+    new_parse(forumName, baseURL, True)

 # Opens Tor Browser

@@ -219,8 +219,8 @@ def crawlForum(driver: webdriver.Firefox):
                     savePage(driver.page_source, topic + f"page{counter}")  # very important

                     # comment out
-                    # if counter == 2:
-                    #     break
+                    if counter == 2:
+                        break

                     try:
                         page = ""  # no next page so far may have some later on

@@ -235,11 +235,11 @@ def crawlForum(driver: webdriver.Firefox):
                 driver.back()

             # comment out
-            # break
+            break

             # comment out
-            # if count == 1:
-            #     break
+            if count == 1:
+                break

             try:
                 link = driver.find_element(by=By.CLASS_NAME, value='qa-page-next').get_attribute('href')

@@ -255,7 +255,7 @@ def crawlForum(driver: webdriver.Firefox):
             print(link, e)
         i += 1

-    input("Crawling HiddenAnswers done successfully. Press ENTER to continue\n")
+    print("Crawling the HiddenAnswers forum done.")

 # Returns 'True' if the link is Topic link


+9 -1  Forums/Initialization/forumsList.txt

@@ -1 +1,9 @@
-HiddenAnswers
+AbyssForum
+Altenens
+BestCardingWorld
+Cardingleaks
+CryptBB
+HiddenAnswers
+Libre
+OnniForums
+Procrax

+19 -6  Forums/Initialization/forums_mining.py

@@ -4,7 +4,6 @@ __author__ = 'DarkWeb'
 Starting point of the Darkweb Forums Mining
 '''

-import os
 from datetime import *
 from Forums.BestCardingWorld.crawler_selenium import crawler as crawlerBestCardingWorld
 from Forums.CryptBB.crawler_selenium import crawler as crawlerCryptBB

@@ -17,7 +16,8 @@ from Forums.Altenens.crawler_selenium import crawler as crawlerAltenens
 from Forums.Libre.crawler_selenium import crawler as crawlerLibre

 import configparser
-import time
+import os
+import subprocess

 config = configparser.ConfigParser()
 config.read('../../setup.ini')

@@ -88,9 +88,22 @@ def createSubdirectories(pagesDir):
         os.mkdir(descReadDir)

+
+# Opens Tor Browser
+def opentor():
+    global pid
+    print("Connecting Tor...")
+    pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
+    pid = pro.pid
+    # time.sleep(7.5)
+    input('Press ENTER when Tor is connected to continue')
+    return
+

 # main method
 if __name__ == '__main__':
+    # opentor()
+
     # assignment from forumsList.txt
     forumsList = getForums()

@@ -98,10 +111,10 @@ if __name__ == '__main__':
     for forum in forumsList:
         forum = forum.replace('\n','')
-        print("Creating listing and description directories ... for " + forum)
+        print("\nCreating listing and description directories ... for " + forum)
         createDirectory(forum)
-        time.sleep(5)  # wait for directories to be created
-        print("Directories created successfully.")
+        # time.sleep(5)  # wait for directories to be created
+        print("Directories created.")

         if forum == "BestCardingWorld":
             crawlerBestCardingWorld()

@@ -122,7 +135,7 @@ if __name__ == '__main__':
         elif forum == 'Libre':
             crawlerLibre()

-    print("Scraping process completed successfully!")
+    print("Scraping process completed!")


+25 -8  Forums/Initialization/prepare_parser.py

@@ -73,13 +73,11 @@ def persist_data(url, row, cur):
     forum = create_forum(cur, row, url)

     board = create_board(cur, row, forum)

-    author = create_author(cur, row, forum)
+    author = create_user(cur, row, forum, 0)

-    topic = create_topic(cur, forum, row, author)
+    topic = create_topic(cur, row, forum, board, author)

-    create_posts(cur, row, forum, board, topic)
+    create_posts(cur, row, forum, topic)


 def incrementError():

@@ -191,8 +189,9 @@ def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descript
         if trace.find("already exists") == -1:
             incrementError()
             print(f"There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!")
+            traceback.print_exc()
             if createLog:
-                logFile.write(str(nError) + f"There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n")
+                logFile.write(str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n")
             return False
         else:
             return True

@@ -223,7 +222,7 @@ def new_parse(forum, url, createLog):
     from Forums.Initialization.forums_mining import config, CURRENT_DATE

-    print("Parsing The " + forum + " Forum and conduct data classification to store the information in the database.")
+    print("Parsing the " + forum + " forum and conduct data classification to store the information in the database.")

     # Connecting to the database
     con = connectDataBase()

@@ -261,6 +260,7 @@ def new_parse(forum, url, createLog):
         parseDescriptionError = False
         persistDescriptionError = False
         moveDescriptionError = False
+        findDescriptionError = False

         rw = []

@@ -272,6 +272,8 @@ def new_parse(forum, url, createLog):
         if doDescription:

+            nFound = 0
+
             for rec in rw:
                 rec = rec.split(',')

@@ -280,6 +282,9 @@ def new_parse(forum, url, createLog):
                 # Reading the associated description Html Pages
                 descriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", descriptionPattern))

+                nFound += len(descriptions)
+
                 for descriptionIndex, descriptionFile in enumerate(descriptions):
                     print("Reading description folder of '" + forum + "', file '" + os.path.basename(

@@ -331,7 +336,19 @@ def new_parse(forum, url, createLog):
                 else:
                     moveDescriptionError = True

-        if not (readDescriptionError or parseDescriptionError or persistDescriptionError or moveDescriptionError):
+            if not (nFound > 0):
+                findDescriptionError = True
+
+                incrementError()
+                print(f"There was a problem to locate the file(s) for {listingFile} in the Description section!")
+                if createLog:
+                    logFile.write(
+                        str(nError) + f". There was a problem to locate the file(s) for {listingFile}"
+                                      f" in the Description section!\n")
+
+        if not (readDescriptionError or parseDescriptionError or persistDescriptionError
+                or moveDescriptionError or findDescriptionError):

             # move listing files of completed folder
             move_file(listingFile, createLog, logFile)
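
The persist_data() hunk rewires every call after create_board(): the author row now comes from create_user(), the topic is keyed to both board and author, and create_posts() no longer takes the board. Reading only the -/+ lines above, the new chain is as sketched below (the create_* helpers are defined elsewhere in the repo and are untouched by this commit):

    # Post-commit persist_data() chain, using only the signatures in the hunk.
    def persist_data(url, row, cur):
        forum = create_forum(cur, row, url)
        board = create_board(cur, row, forum)
        author = create_user(cur, row, forum, 0)              # was create_author(cur, row, forum)
        topic = create_topic(cur, row, forum, board, author)  # board now passed in
        create_posts(cur, row, forum, topic)                  # board argument dropped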


+3 -3  Forums/Libre/crawler_selenium.py

@@ -31,7 +31,7 @@ def startCrawling():
     # opentor()
     forumName = getForumName()
     # driver = getAccess()
-
+    #
     # if driver != 'down':
     #     try:
     #         login(driver)

@@ -40,7 +40,7 @@ def startCrawling():
     #         print(driver.current_url, e)
     #     closetor(driver)

-    new_parse(forumName, baseURL, False)
+    new_parse(forumName, baseURL, True)

 # Opens Tor Browser

@@ -275,7 +275,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1

-    input("Crawling Libre done successfully. Press ENTER to continue\n")
+    input("Crawling the Libre forum done.")

 # Returns 'True' if the link is Topic link, may need to change for every website


+2 -6  Forums/Libre/parser.py

@@ -9,11 +9,8 @@ import re
 # Here, we are importing BeautifulSoup to search through the HTML tree
 from bs4 import BeautifulSoup, ResultSet, Tag

-# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
 # This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
 def libre_description_parser(soup: Tag):

     # Fields to be parsed

@@ -88,7 +85,6 @@ def libre_description_parser(soup: Tag):
         date_posted = ipost.find("p", {"class": "text-zinc-400 text-justify"}).text
         date_time_cleaned = date_posted.replace(user_name, "")[3:-12]
-        print(date_time_cleaned)
         datetime_append = datetime.strptime(date_time_cleaned, "%Y-%m-%d %H:%M:%S GMT")
         addDate.append(datetime_append)

@@ -130,7 +126,6 @@ def libre_description_parser(soup: Tag):

 # This is the method to parse the Listing Pages (one page with many posts)
 def libre_listing_parser(soup):
-
     nm = 0  # *this variable should receive the number of topics
     forum = "Libre"  # 0 *forum name

@@ -217,6 +212,7 @@ def libre_listing_parser(soup):
         addDate=addDate
     )

+
 def libre_links_parser(soup):
     # Returning all links that should be visited by the Crawler
     href = []


+13 -13  Forums/OnniForums/crawler_selenium.py

@@ -31,19 +31,19 @@ baseURL = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion
 # Opens Tor Browser, crawls the website
 def startCrawling():
-    opentor()
+    # opentor()
     forumName = getForumName()
-    driver = getAccess()
+    # driver = getAccess()
+    #
+    # if driver != 'down':
+    #     try:
+    #         login(driver)
+    #         crawlForum(driver)
+    #     except Exception as e:
+    #         print(driver.current_url, e)
+    #     closetor(driver)
-    if driver != 'down':
-        try:
-            login(driver)
-            crawlForum(driver)
-        except Exception as e:
-            print(driver.current_url, e)
-        closetor(driver)
-    # new_parse(forum=forumName, url=baseURL, createLog=False)
+    new_parse(forum=forumName, url=baseURL, createLog=True)

 # Opens Tor Browser

@@ -214,7 +214,7 @@ def getInterestedLinks():
 def crawlForum(driver):
-    print("Crawling the OnniForums")
+    print("Crawling the OnniForums forum")

     linksToCrawl = getInterestedLinks()

@@ -288,7 +288,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1

-    input("Crawling OnniForums done successfully. Press ENTER to continue\n")
+    print("Crawling the OnniForums forum done.")

 # Returns 'True' if the link is Topic link


+10 -10  Forums/Procrax/crawler_selenium.py

@@ -34,7 +34,7 @@ FORUM_NAME = 'Procrax'
 def startCrawling():
     # opentor()
     # driver = getAccess()
-
+    #
     # if driver != 'down':
     #     try:
     #         login(driver)

@@ -46,7 +46,7 @@ def startCrawling():
     new_parse(
         forum=FORUM_NAME,
         url=BASE_URL,
-        createLog=False
+        createLog=True
     )

@@ -190,9 +190,9 @@ def getInterestedLinks():
     # # general hacking
     links.append('https://procrax.cx/forums/general-hacking.24/')
     # # hacking security tools
-    links.append('https://procrax.cx/forums/hacking-security-tools.20/')
+    # links.append('https://procrax.cx/forums/hacking-security-tools.20/')
     # # hacktube
-    links.append('https://procrax.cx/forums/hacktube.22/')
+    # links.append('https://procrax.cx/forums/hacktube.22/')
     # # cardable
     # links.append('https://procrax.cx/forums/cardable-websites.28/')
     # # tools

@@ -205,7 +205,7 @@ def getInterestedLinks():
 def crawlForum(driver):
-    print("Crawling the Procrax")
+    print("Crawling the Procrax forum")

     linksToCrawl = getInterestedLinks()

@@ -240,8 +240,8 @@ def crawlForum(driver):
                     savePage(driver.page_source, topic + f"page{counter}")  # very important

                     # comment out
-                    # if counter == 2:
-                    #     break
+                    if counter == 2:
+                        break

                     try:
                         page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href')

@@ -257,10 +257,10 @@ def crawlForum(driver):
                 driver.back()

             # comment out
-            # break
+            break

             # comment out
-            if count == 20:
+            if count == 1:
                 break

             try:

@@ -278,7 +278,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1

-    input("Crawling Procrax done successfully. Press ENTER to continue\n")
+    print("Crawling the Procrax forum done.")

 # Returns 'True' if the link is Topic link, may need to change for every website

