Browse Source

finished onni

main
Helium 1 year ago
parent
commit
8fec2e140b
8 changed files with 1297 additions and 52 deletions
  1. +2
    -2
      Forums/CrackingPro/crawler_mechanize.py
  2. +29
    -25
      Forums/CrackingPro/crawler_selenium.py
  3. +3
    -3
      Forums/CryptBB/crawler_mechanize.py
  4. +14
    -13
      Forums/CryptBB/crawler_selenium.py
  5. +3
    -0
      Forums/Initialization/forums_mining.py
  6. +1236
    -0
      Forums/Initialization/geckodriver.log
  7. +8
    -8
      Forums/OnniForums/crawler_selenium.py
  8. +2
    -1
      Forums/OnniForums/parser.py

+ 2
- 2
Forums/CrackingPro/crawler_mechanize.py View File

@ -175,7 +175,7 @@ def crawlForum(br):
link = linksToCrawl[i] link = linksToCrawl[i]
print('Crawling :', link) print('Crawling :', link)
try: try:
page = br.open(link)
page = br.open(link)#open url
savePage(page, link) savePage(page, link)
res = br.response().read() res = br.response().read()
@ -198,7 +198,7 @@ def crawlForum(br):
# finalTime = time.time() # finalTime = time.time()
# print finalTime - initialTime # print finalTime - initialTime
input("Crawling Cracking Forum forum done sucessfully. Press ENTER to continue\n")
input("Crawling CrackingPro forum done sucessfully. Press ENTER to continue\n")
return return


+ 29
- 25
Forums/CrackingPro/crawler_selenium.py View File

@ -24,7 +24,7 @@ from Forums.OnniForums.parser import cryptBB_links_parser
from Forums.Utilities.utilities import cleanHTML from Forums.Utilities.utilities import cleanHTML
counter = 1 counter = 1
baseURL = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/'
baseURL = 'https://www.crackingpro.com/'
# Opens Tor Browser, crawls the website # Opens Tor Browser, crawls the website
@ -58,36 +58,40 @@ def opentor():
# Login using premade account credentials and do login captcha manually # Login using premade account credentials and do login captcha manually
def login(driver): def login(driver):
#click login button #click login button
login_link = driver.find_element( login_link = driver.find_element(
by=By.XPATH, value='/html/body/div/div[2]/div/table/tbody/tr[2]/td/center/pre/strong/a').\
by=By.ID, value='elUserSignIn').\
get_attribute('href') get_attribute('href')
driver.get(login_link) driver.get(login_link)
#entering username and password into input boxes #entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[2]/td[2]/input')
usernameBox = driver.find_element(by=By.ID, value='auth')
#Username here #Username here
usernameBox.send_keys('purely_cabbage')
passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[3]/td[2]/input')
usernameBox.send_keys('cheese_pizza_man')
passwordBox = driver.find_element(by=By.ID, value='password')
#Password here #Password here
passwordBox.send_keys('$ourP@tchK1ds')
passwordBox.send_keys('Gr33nSp@m&3ggs')
input("Press ENTER when log in is completed\n") input("Press ENTER when log in is completed\n")
# wait for listing page show up (This Xpath may need to change based on different seed url) # wait for listing page show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 50).until(EC.visibility_of_element_located( WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.XPATH, '//*[@id="content"]')))
(By.XPATH, '/html/body/main/div/div/div[1]/section/ol/li[8]')))
# Returns the name of the website # Returns the name of the website
def getForumName(): def getForumName():
name = 'OnniForums'
name = 'CrackingPro'
return name return name
# Return the link of the website # Return the link of the website
def getFixedURL(): def getFixedURL():
url = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/'
url = 'https://www.crackingpro.com/'
return url return url
@ -97,7 +101,7 @@ def closetor(driver):
# os.system("taskkill /pid " + str(pro.pid)) # os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe") # os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...') print('Closing Tor...')
driver.close()
driver.close()# close the current tab
time.sleep(3) time.sleep(3)
return return
@ -143,10 +147,10 @@ def getAccess():
url = getFixedURL() url = getFixedURL()
driver = createFFDriver() driver = createFFDriver()
try: try:
driver.get(url)
driver.get(url)# open given url
return driver return driver
except: except:
driver.close()
driver.close()#close the current tab
return 'down' return 'down'
@ -164,11 +168,11 @@ def getFullPathName(url):
fileName = getNameFromURL(url) fileName = getNameFromURL(url)
if isDescriptionLink(url): if isDescriptionLink(url):
#..\CryptBB\HTML_Pages\\ #..\CryptBB\HTML_Pages\\
fullPath = r'..\OnniForums\HTML_Pages\\' + str(
fullPath = r'..\CrackingPro\HTML_Pages\\' + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str( "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html' "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
else: else:
fullPath = r'..\OnniForums\HTML_Pages\\' + str(
fullPath = r'..\CrackingPro\HTML_Pages\\' + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str( "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html' "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
return fullPath return fullPath
@ -187,8 +191,8 @@ def getNameFromURL(url):
def getInterestedLinks(): def getInterestedLinks():
links = [] links = []
# Hacking & Cracking tutorials
links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-tutorials')
# exploiting tutorials
links.append('https://www.crackingpro.com/forum/38-exploiting-tutorials/')
# Hacking & Cracking questions # Hacking & Cracking questions
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-questions') # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-questions')
# Exploit PoCs # Exploit PoCs
@ -212,7 +216,7 @@ def getInterestedLinks():
def crawlForum(driver): def crawlForum(driver):
print("Crawling the OnniForums forum")
print("Crawling the CrackingPro forum")
linksToCrawl = getInterestedLinks() linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl) visited = set(linksToCrawl)
@ -233,7 +237,7 @@ def crawlForum(driver):
has_next_page = True has_next_page = True
while has_next_page: while has_next_page:
list = topicPages(html)
list = topicPages(html)#parses?
for item in list: for item in list:
itemURL = urlparse.urljoin(baseURL, str(item)) itemURL = urlparse.urljoin(baseURL, str(item))
try: try:
@ -247,13 +251,13 @@ def crawlForum(driver):
# comment out # comment out
#if count == 1: #if count == 1:
#count = 0
#break
# count = 0
# break
try: try:
temp = driver.find_element(by=By.XPATH, value= temp = driver.find_element(by=By.XPATH, value=
'/html/body/div/div[2]/div/div[2]/div')
link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
'/html/body/main/div/div/div/div[4]/div/div[1]/div/ul/')
link = temp.find_element(by=By.CLASS_NAME, value='ipsPagination_next').get_attribute('href')
if link == "": if link == "":
raise NoSuchElementException raise NoSuchElementException
@ -275,19 +279,19 @@ def crawlForum(driver):
# finalTime = time.time() # finalTime = time.time()
# print finalTime - initialTime # print finalTime - initialTime
input("Crawling OnniForums forum done sucessfully. Press ENTER to continue\n")
input("Crawling CrackingPro forum done sucessfully. Press ENTER to continue\n")
# Returns 'True' if the link is Topic link # Returns 'True' if the link is Topic link
def isDescriptionLink(url): def isDescriptionLink(url):
if 'Thread' in url:
if 'topic' in url:
return True return True
return False return False
# Returns True if the link is a listingPage link # Returns True if the link is a listingPage link
def isListingLink(url): def isListingLink(url):
if 'Forum' in url:
if 'forum' in url:
return True return True
return False return False


+ 3
- 3
Forums/CryptBB/crawler_mechanize.py View File

@ -1,7 +1,7 @@
__author__ = '91Shadows' __author__ = '91Shadows'
''' '''
BestCardingWorld Crawler (Mechanize)
CryptBB Crawler (Mechanize)
''' '''
import codecs, os, re import codecs, os, re
@ -203,14 +203,14 @@ def crawlForum(br):
return return
# Returns True if the link is 'Topic' Links
# Returns True if the link is a 'Topic' link; this check may need to change for different websites
def isDescriptionLink(url): def isDescriptionLink(url):
if 'topic' in url: if 'topic' in url:
return True return True
return False return False
# Returns True if the link is a listingPage link
# Returns True if the link is a listingPage link; this check may need to change for different websites
def isListingLink(url): def isListingLink(url):
''' '''
reg = 'board=[0-9]+.[0-9]+\Z' reg = 'board=[0-9]+.[0-9]+\Z'


+ 14
- 13
Forums/CryptBB/crawler_selenium.py View File

@ -62,15 +62,15 @@ def login(driver):
login_link = driver.find_element( login_link = driver.find_element(
by=By.XPATH, value='/html/body/div/div[2]/div/table/tbody/tr[2]/td/center/pre/strong/a[1]').\ by=By.XPATH, value='/html/body/div/div[2]/div/table/tbody/tr[2]/td/center/pre/strong/a[1]').\
get_attribute('href') get_attribute('href')
driver.get(login_link)
driver.get(login_link)# open tab with url
#entering username and password into input boxes #entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[2]/td[2]/input') usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[2]/td[2]/input')
#Username here #Username here
usernameBox.send_keys('holyre')
usernameBox.send_keys('holyre')#sends string to the username box
passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[3]/td[2]/input') passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[3]/td[2]/input')
#Password here #Password here
passwordBox.send_keys('PlatinumBorn2')
passwordBox.send_keys('PlatinumBorn2')# sends string to passwordBox
''' '''
# wait for captcha page show up # wait for captcha page show up
@ -101,6 +101,7 @@ def login(driver):
input("Press ENTER when CAPTCHA is completed\n") input("Press ENTER when CAPTCHA is completed\n")
# wait for listing page show up (This Xpath may need to change based on different seed url) # wait for listing page show up (This Xpath may need to change based on different seed url)
# wait up to 50 seconds for the element with id="tab_content" to become visible, then continue
WebDriverWait(driver, 50).until(EC.visibility_of_element_located( WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.XPATH, '//*[@id="tab_content"]'))) (By.XPATH, '//*[@id="tab_content"]')))
@ -123,7 +124,7 @@ def closetor(driver):
# os.system("taskkill /pid " + str(pro.pid)) # os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe") # os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...') print('Closing Tor...')
driver.close()
driver.close() #close tab
time.sleep(3) time.sleep(3)
return return
@ -169,10 +170,10 @@ def getAccess():
url = getFixedURL() url = getFixedURL()
driver = createFFDriver() driver = createFFDriver()
try: try:
driver.get(url)
driver.get(url)# open url in browser
return driver return driver
except: except:
driver.close()
driver.close()# close tab
return 'down' return 'down'
@ -188,7 +189,7 @@ def savePage(page, url):
# Gets the full path of the page to be saved along with its appropriate file name # Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url): def getFullPathName(url):
fileName = getNameFromURL(url) fileName = getNameFromURL(url)
if isDescriptionLink(url):#..\CryptBB\HTML_Pages\\
if isDescriptionLink(url):
fullPath = r'..\\CryptBB\\HTML_Pages\\' + str( fullPath = r'..\\CryptBB\\HTML_Pages\\' + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str( "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html' "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
@ -250,7 +251,7 @@ def crawlForum(driver):
print('Crawling :', link) print('Crawling :', link)
try: try:
try: try:
driver.get(link)
driver.get(link)# open
except: except:
driver.refresh() driver.refresh()
html = driver.page_source html = driver.page_source
@ -258,7 +259,7 @@ def crawlForum(driver):
has_next_page = True has_next_page = True
while has_next_page: while has_next_page:
list = topicPages(html)
list = topicPages(html)#parses?
for item in list: for item in list:
itemURL = urlparse.urljoin(baseURL, str(item)) itemURL = urlparse.urljoin(baseURL, str(item))
try: try:
@ -275,7 +276,7 @@ def crawlForum(driver):
count = 0 count = 0
break break
try:
try:# change depending on web page
temp = driver.find_element(by=By.XPATH, value= temp = driver.find_element(by=By.XPATH, value=
'/html/body/div/div[2]/div/div[2]/div') '/html/body/div/div[2]/div/div[2]/div')
link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
@ -300,17 +301,17 @@ def crawlForum(driver):
# finalTime = time.time() # finalTime = time.time()
# print finalTime - initialTime # print finalTime - initialTime
input("Crawling CryptBB forum done sucessfully. Press ENTER to continue\n")
input("Crawling CryptBB forum done successfully. Press ENTER to continue\n")
# Returns 'True' if the link is Topic link
# Returns 'True' if the link is a Topic link; this check may need to change for every website
def isDescriptionLink(url): def isDescriptionLink(url):
if 'thread' in url: if 'thread' in url:
return True return True
return False return False
# Returns True if the link is a listingPage link
# Returns True if the link is a listingPage link; this check may need to change for every website
def isListingLink(url): def isListingLink(url):
if 'forum' in url: if 'forum' in url:
return True return True


+ 3
- 0
Forums/Initialization/forums_mining.py View File

@ -9,6 +9,7 @@ from datetime import *
from Forums.BestCardingWorld.crawler_selenium import crawler as crawlerBestCardingWorld from Forums.BestCardingWorld.crawler_selenium import crawler as crawlerBestCardingWorld
from Forums.CryptBB.crawler_selenium import crawler as crawlerCryptBB from Forums.CryptBB.crawler_selenium import crawler as crawlerCryptBB
from Forums.OnniForums.crawler_selenium import crawler as crawlerOnniForums from Forums.OnniForums.crawler_selenium import crawler as crawlerOnniForums
#from Forums.CrackingPro.crawler_selenium import crawler as crawlerCrackingPro
import time import time
@ -99,6 +100,8 @@ if __name__ == '__main__':
crawlerCryptBB() crawlerCryptBB()
elif forum == "OnniForums": elif forum == "OnniForums":
crawlerOnniForums() crawlerOnniForums()
elif forum == "CrackingPro":
crawlerCrackingPro()
print("Scraping process completed successfully!") print("Scraping process completed successfully!")


+ 1236
- 0
Forums/Initialization/geckodriver.log
File diff suppressed because it is too large
View File


+ 8
- 8
Forums/OnniForums/crawler_selenium.py View File

@ -20,7 +20,7 @@ from datetime import date
import subprocess import subprocess
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse from Forums.Initialization.prepare_parser import new_parse
from Forums.OnniForums.parser import cryptBB_links_parser
from Forums.OnniForums.parser import onniForums_links_parser
from Forums.Utilities.utilities import cleanHTML from Forums.Utilities.utilities import cleanHTML
counter = 1 counter = 1
@ -242,17 +242,17 @@ def crawlForum(driver):
driver.refresh() driver.refresh()
savePage(driver.page_source, item) savePage(driver.page_source, item)
driver.back() driver.back()
# comment out
break
# comment out, one topic per page
# break
# comment out
# comment out, go through all pages
#if count == 1: #if count == 1:
#count = 0
#break
# count = 0
# break
try: try:
temp = driver.find_element(by=By.XPATH, value= temp = driver.find_element(by=By.XPATH, value=
'/html/body/div/div[2]/div/div[2]/div')
'/html/body/div/div[2]/div/div[3]/div') # /html/body/div/div[2]/div/div[3]/div
link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
if link == "": if link == "":
@ -296,7 +296,7 @@ def isListingLink(url):
def topicPages(html): def topicPages(html):
soup = BeautifulSoup(html, "html.parser") soup = BeautifulSoup(html, "html.parser")
#print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text) #print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text)
return cryptBB_links_parser(soup)
return onniForums_links_parser(soup)
def crawler(): def crawler():


+ 2
- 1
Forums/OnniForums/parser.py View File

@ -334,7 +334,8 @@ def onniForums_listing_parser(soup):
#return organizeTopics("TheMajesticGarden", nm, topic, board, view, post, user, addDate, href) #return organizeTopics("TheMajesticGarden", nm, topic, board, view, post, user, addDate, href)
def cryptBB_links_parser(soup):
#need to change this method
def onniForums_links_parser(soup):
# Returning all links that should be visited by the Crawler # Returning all links that should be visited by the Crawler


Loading…
Cancel
Save