finished onni

2 years ago · 8fec2e140b
--- a/Forums/CrackingPro/crawler_mechanize.py
+++ b/Forums/CrackingPro/crawler_mechanize.py
@ -175,7 +175,7 @@ def crawlForum(br):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            page = br.open(link)
            page = br.open(link)#open url
            savePage(page, link)
            res = br.response().read()
@ -198,7 +198,7 @@ def crawlForum(br):
    # finalTime = time.time()
    # print finalTime - initialTime
    input("Crawling Cracking Forum forum done sucessfully. Press ENTER to continue\n")
    input("Crawling CrackingPro forum done sucessfully. Press ENTER to continue\n")
    return
--- a/Forums/CrackingPro/crawler_selenium.py
+++ b/Forums/CrackingPro/crawler_selenium.py
@ -24,7 +24,7 @@ from Forums.OnniForums.parser import cryptBB_links_parser
 from Forums.Utilities.utilities import cleanHTML
 counter = 1
 baseURL = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/'
 baseURL = 'https://www.crackingpro.com/'
 # Opens Tor Browser, crawls the website
@ -58,36 +58,40 @@ def opentor():
 # Login using premade account credentials and do login captcha manually
 def login(driver):
    #click login button
    login_link = driver.find_element(
                by=By.XPATH, value='/html/body/div/div[2]/div/table/tbody/tr[2]/td/center/pre/strong/a').\
                by=By.ID, value='elUserSignIn').\
                get_attribute('href')
    driver.get(login_link)
    #entering username and password into input boxes
    usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[2]/td[2]/input')
    usernameBox = driver.find_element(by=By.ID, value='auth')
    #Username here
    usernameBox.send_keys('purely_cabbage')
    passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[3]/td[2]/input')
    usernameBox.send_keys('cheese_pizza_man')
    passwordBox = driver.find_element(by=By.ID, value='password')
    #Password here
    passwordBox.send_keys('$ourP@tchK1ds')
    passwordBox.send_keys('Gr33nSp@m&3ggs')
    input("Press ENTER when log in is completed\n")
    # wait for listing page show up (This Xpath may need to change based on different seed url)
    WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
        (By.XPATH, '//*[@id="content"]')))
        (By.XPATH, '/html/body/main/div/div/div[1]/section/ol/li[8]')))
 # Returns the name of the website
 def getForumName():
    name = 'OnniForums'
    name = 'CrackingPro'
    return name
 # Return the link of the website
 def getFixedURL():
    url = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/'
    url = 'https://www.crackingpro.com/'
    return url
@ -97,7 +101,7 @@ def closetor(driver):
    # os.system("taskkill /pid " + str(pro.pid))
    # os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.close()
    driver.close()# close the current tab
    time.sleep(3)
    return
@ -143,10 +147,10 @@ def getAccess():
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)
        driver.get(url)# open given url
        return driver
    except:
        driver.close()
        driver.close()#close the current tab
        return 'down'
@ -164,11 +168,11 @@ def getFullPathName(url):
    fileName = getNameFromURL(url)
    if isDescriptionLink(url):
        #..\CryptBB\HTML_Pages\\
        fullPath = r'..\OnniForums\HTML_Pages\\' + str(
        fullPath = r'..\CrackingPro\HTML_Pages\\' + str(
            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
            "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
    else:
        fullPath = r'..\OnniForums\HTML_Pages\\' + str(
        fullPath = r'..\CrackingPro\HTML_Pages\\' + str(
            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
            "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
    return fullPath
@ -187,8 +191,8 @@ def getNameFromURL(url):
 def getInterestedLinks():
    links = []
    # Hacking & Cracking tutorials
    links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-tutorials')
    # exploiting tutorials
    links.append('https://www.crackingpro.com/forum/38-exploiting-tutorials/')
    # Hacking & Cracking questions
    # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-questions')
    # Exploit PoCs
@ -212,7 +216,7 @@ def getInterestedLinks():
 def crawlForum(driver):
    print("Crawling the OnniForums forum")
    print("Crawling the CrackingPro forum")
    linksToCrawl = getInterestedLinks()
    visited = set(linksToCrawl)
@ -233,7 +237,7 @@ def crawlForum(driver):
            has_next_page = True
            while has_next_page:
                list = topicPages(html)
                list = topicPages(html)#parses?
                for item in list:
                    itemURL = urlparse.urljoin(baseURL, str(item))
                    try:
@ -247,13 +251,13 @@ def crawlForum(driver):
                # comment out
                #if count == 1:
                    #count = 0
                    #break
                #    count = 0
                #    break
                try:
                    temp = driver.find_element(by=By.XPATH, value=
                        '/html/body/div/div[2]/div/div[2]/div')
                    link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
                        '/html/body/main/div/div/div/div[4]/div/div[1]/div/ul/')
                    link = temp.find_element(by=By.CLASS_NAME, value='ipsPagination_next').get_attribute('href')
                    if link == "":
                        raise NoSuchElementException
@ -275,19 +279,19 @@ def crawlForum(driver):
    # finalTime = time.time()
    # print finalTime - initialTime
    input("Crawling OnniForums forum done sucessfully. Press ENTER to continue\n")
    input("Crawling CrackingPro forum done sucessfully. Press ENTER to continue\n")
 # Returns 'True' if the link is Topic link
 def isDescriptionLink(url):
    if 'Thread' in url:
    if 'topic' in url:
        return True
    return False
 # Returns True if the link is a listingPage link
 def isListingLink(url):
    if 'Forum' in url:
    if 'forum' in url:
        return True
    return False
--- a/Forums/CryptBB/crawler_mechanize.py
+++ b/Forums/CryptBB/crawler_mechanize.py
@ -1,7 +1,7 @@
 __author__ = '91Shadows'
 '''
 BestCardingWorld Crawler (Mechanize)
 CryptBB Crawler (Mechanize)
 '''
 import codecs, os, re
@ -203,14 +203,14 @@ def crawlForum(br):
    return
 # Returns True if the link is 'Topic' Links
 # Returns True if the link is 'Topic' Links, may need to change for diff websites
 def isDescriptionLink(url):
    if 'topic' in url:
        return True
    return False
 # Returns True if the link is a listingPage link
 # Returns True if the link is a listingPage link, may need to change for diff websites
 def isListingLink(url):
    '''
    reg = 'board=[0-9]+.[0-9]+\Z'
--- a/Forums/CryptBB/crawler_selenium.py
+++ b/Forums/CryptBB/crawler_selenium.py
@ -62,15 +62,15 @@ def login(driver):
    login_link = driver.find_element(
                by=By.XPATH, value='/html/body/div/div[2]/div/table/tbody/tr[2]/td/center/pre/strong/a[1]').\
                get_attribute('href')
    driver.get(login_link)
    driver.get(login_link)# open tab with url
    #entering username and password into input boxes
    usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[2]/td[2]/input')
    #Username here
    usernameBox.send_keys('holyre')
    usernameBox.send_keys('holyre')#sends string to the username box
    passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[3]/td[2]/input')
    #Password here
    passwordBox.send_keys('PlatinumBorn2')
    passwordBox.send_keys('PlatinumBorn2')# sends string to passwordBox
    '''
    # wait for captcha page show up
@ -101,6 +101,7 @@ def login(driver):
    input("Press ENTER when CAPTCHA is completed\n")
    # wait for listing page show up (This Xpath may need to change based on different seed url)
    # wait for 50 sec until id = tab_content is found, then cont
    WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
        (By.XPATH, '//*[@id="tab_content"]')))
@ -123,7 +124,7 @@ def closetor(driver):
    # os.system("taskkill /pid " + str(pro.pid))
    # os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.close()
    driver.close() #close tab
    time.sleep(3)
    return
@ -169,10 +170,10 @@ def getAccess():
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)
        driver.get(url)# open url in browser
        return driver
    except:
        driver.close()
        driver.close()# close tab
        return 'down'
@ -188,7 +189,7 @@ def savePage(page, url):
 # Gets the full path of the page to be saved along with its appropriate file name
 def getFullPathName(url):
    fileName = getNameFromURL(url)
    if isDescriptionLink(url):#..\CryptBB\HTML_Pages\\
    if isDescriptionLink(url):
        fullPath = r'..\\CryptBB\\HTML_Pages\\' + str(
            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
            "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
@ -250,7 +251,7 @@ def crawlForum(driver):
        print('Crawling :', link)
        try:
            try:
                driver.get(link)
                driver.get(link)# open
            except:
                driver.refresh()
            html = driver.page_source
@ -258,7 +259,7 @@ def crawlForum(driver):
            has_next_page = True
            while has_next_page:
                list = topicPages(html)
                list = topicPages(html)#parses?
                for item in list:
                    itemURL = urlparse.urljoin(baseURL, str(item))
                    try:
@ -275,7 +276,7 @@ def crawlForum(driver):
                    count = 0
                    break
                try:
                try:# change depending on web page
                    temp = driver.find_element(by=By.XPATH, value=
                        '/html/body/div/div[2]/div/div[2]/div')
                    link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
@ -300,17 +301,17 @@ def crawlForum(driver):
    # finalTime = time.time()
    # print finalTime - initialTime
    input("Crawling CryptBB forum done sucessfully. Press ENTER to continue\n")
    input("Crawling CryptBB forum done successfully. Press ENTER to continue\n")
 # Returns 'True' if the link is Topic link
 # Returns 'True' if the link is Topic link, may need to change for every website
 def isDescriptionLink(url):
    if 'thread' in url:
        return True
    return False
 # Returns True if the link is a listingPage link
 # Returns True if the link is a listingPage link, may need to change for every website
 def isListingLink(url):
    if 'forum' in url:
        return True
--- a/Forums/Initialization/forums_mining.py
+++ b/Forums/Initialization/forums_mining.py
@ -9,6 +9,7 @@ from datetime import *
 from Forums.BestCardingWorld.crawler_selenium import crawler as crawlerBestCardingWorld
 from Forums.CryptBB.crawler_selenium import crawler as crawlerCryptBB
 from Forums.OnniForums.crawler_selenium import crawler as crawlerOnniForums
 #from Forums.CrackingPro.crawler_selenium import crawler as crawlerCrackingPro
 import time
@ -99,6 +100,8 @@ if __name__ == '__main__':
            crawlerCryptBB()
        elif forum == "OnniForums":
            crawlerOnniForums()
        elif forum == "CrackingPro":
            crawlerCrackingPro()
    print("Scraping process completed successfully!")
--- a/Forums/Initialization/geckodriver.log
+++ b/Forums/Initialization/geckodriver.log
--- a/Forums/OnniForums/crawler_selenium.py
+++ b/Forums/OnniForums/crawler_selenium.py
@ -20,7 +20,7 @@ from datetime import date
 import subprocess
 from bs4 import BeautifulSoup
 from Forums.Initialization.prepare_parser import new_parse
 from Forums.OnniForums.parser import cryptBB_links_parser
 from Forums.OnniForums.parser import onniForums_links_parser
 from Forums.Utilities.utilities import cleanHTML
 counter = 1
@ -242,17 +242,17 @@ def crawlForum(driver):
                        driver.refresh()
                    savePage(driver.page_source, item)
                    driver.back()
                    # comment out
                    break
                    # comment out, one topic per page
                    # break
                # comment out
                # comment out, go through all pages
                #if count == 1:
                    #count = 0
                    #break
                 #   count = 0
                  #  break
                try:
                    temp = driver.find_element(by=By.XPATH, value=
                        '/html/body/div/div[2]/div/div[2]/div')
                        '/html/body/div/div[2]/div/div[3]/div') # /html/body/div/div[2]/div/div[3]/div
                    link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
                    if link == "":
@ -296,7 +296,7 @@ def isListingLink(url):
 def topicPages(html):
    soup = BeautifulSoup(html, "html.parser")
    #print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text)
    return cryptBB_links_parser(soup)
    return onniForums_links_parser(soup)
 def crawler():
--- a/Forums/OnniForums/parser.py
+++ b/Forums/OnniForums/parser.py
@ -334,7 +334,8 @@ def onniForums_listing_parser(soup):
    #return organizeTopics("TheMajesticGarden", nm, topic, board, view, post, user, addDate, href)
 def cryptBB_links_parser(soup):
 #need to change this method
 def onniForums_links_parser(soup):
    # Returning all links that should be visited by the Crawler