Nathan Pham · 1 year ago · commit 15aa4009e6
8 changed files with 105 additions and 126 deletions
  1. +12 -7   Forums/DB_Connection/db_connection.py
  2. +6 -2    Forums/Initialization/prepare_parser.py
  3. +15 -9   MarketPlaces/DB_Connection/db_connection.py
  4. +7 -7    MarketPlaces/DarkBazar/crawler_selenium.py
  5. +0 -1    MarketPlaces/DarkBazar/parser.py
  6. +21 -2   MarketPlaces/Initialization/prepare_parser.py
  7. +27 -94  MarketPlaces/Kingdom/crawler_selenium.py
  8. +17 -4   MarketPlaces/Kingdom/parser.py

+12 -7  Forums/DB_Connection/db_connection.py

@@ -3,7 +3,7 @@ __author__ = 'DarkWeb'
import psycopg2
import traceback
from Forums.Utilities.utilities import *
from dateutil.relativedelta import relativedelta, FR
def connectDataBase():
@@ -484,21 +484,25 @@ def create_posts(cur, row, forumId, topicId):
'dateinserted_post': row[8],
'postId': postId})
def create_status(cur, forumId, date, status):
def create_status(cur, forumId, date, listings, descriptions, status):
date = datetime.strptime(date, "%m%d%Y")
# getting last Friday as a reference date
date_reference = date + relativedelta(weekday=FR(-1))
# checking if status already exists
sql = "select * from forums_status where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s"
cur.execute(sql, {'forum_id': forumId, 'date_inserted': date})
recset = cur.fetchall()
if recset:
sql = "Update forums_status set status = %(status)s where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s"
recset = {'status': status, 'forum_id': forumId, 'date_inserted': date}
sql = "Update forums_status set listings = %(listings)s, descriptions = %(descriptions)s, status = %(status)s, date_reference = %(date_reference)s " \
"where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s"
recset = {'listings': listings, 'descriptions': descriptions, 'status': status, 'date_reference': date_reference, 'forum_id': forumId, 'date_inserted': date}
else:
sql = "Insert into forums_status (forum_id, date_inserted, status) Values (%s, %s, %s)"
recset = [forumId, date, status]
sql = "Insert into forums_status (forum_id, date_inserted, listings, descriptions, status, date_reference) Values (%s, %s, %s, %s, %s, %s)"
recset = [forumId, date, listings, descriptions, status, date_reference]
cur.execute(sql, recset)
@@ -514,7 +518,8 @@ def create_database(cur, con):
sql = "create unique index unique_forum ON forums USING btree (name_forum ASC NULLS LAST)"
cur.execute(sql)
sql = "Create table forums_status (forum_id integer NOT NULL, date_inserted date NOT NULL, status bit(1) NOT NULL, " \
sql = "Create table forums_status (forum_id integer NOT NULL, date_inserted date NOT NULL, " \
"listings integer NOT NULL, descriptions integer NOT NULL, status bit(1) NOT NULL, date_reference date NOT NULL " \
"CONSTRAINT forums_log_pkey PRIMARY KEY (forum_id, date_inserted), " \
"CONSTRAINT forums_fk FOREIGN KEY (forum_id) REFERENCES forums (forum_id))"
cur.execute(sql)
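Both status tables now carry a date_reference column alongside the per-run listing and description counts. The reference date is the most recent Friday relative to the scrape date, computed with dateutil's FR(-1) weekday rule. A minimal standalone sketch of that computation, assuming python-dateutil is installed (the input date is an arbitrary example):

# Minimal sketch of the date_reference computation used in create_status.
from datetime import datetime
from dateutil.relativedelta import relativedelta, FR

date = datetime.strptime("01092024", "%m%d%Y")          # Tuesday, 2024-01-09
date_reference = date + relativedelta(weekday=FR(-1))   # most recent Friday (same day if already a Friday)
print(date_reference.date())                            # 2024-01-05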


+6 -2  Forums/Initialization/prepare_parser.py

@@ -341,10 +341,14 @@ def new_parse(forum, url, createLog):
# move listing files of completed folder
move_file(listingFile, createLog, logFile)
# registering the current forum status (up/down) in the database
# registering the current forum status (up/down) and the number of scraped pages in the database
forumId = verifyForum(cur, forum)
if (forumId > 0):
create_status(cur, forumId, CURRENT_DATE, '1' if len(listings) > 0 else '0')
readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.html'))
readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.html'))
create_status(cur, forumId, CURRENT_DATE, len(readListings), len(readDescriptions), '1' if len(listings) > 0 else '0')
con.commit()
if createLog:
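The two extra arguments passed to create_status are the numbers of listing and description pages already parsed and moved into the read folders, obtained by globbing for HTML files. A standalone sketch of that counting step, assuming the same Windows-style folder layout used by the parser (mainDir and CURRENT_DATE below are placeholder values):

# Hedged sketch: count already-parsed pages the way new_parse does.
import glob
import os

mainDir = r"C:\CrawlerOutput\Forums"     # placeholder root directory
CURRENT_DATE = "01052024"                # MMDDYYYY, as used elsewhere in the pipeline

readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.html'))
readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.html'))
print(len(readListings), "listings,", len(readDescriptions), "descriptions")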


+15 -9  MarketPlaces/DB_Connection/db_connection.py

@@ -4,7 +4,7 @@ import psycopg2
import traceback
import configparser
from MarketPlaces.Utilities.utilities import *
from dateutil.relativedelta import relativedelta, FR
def connectDataBase():
@@ -273,6 +273,8 @@ def create_items(cur, row, marketId, vendorId):
if newItem:
# decode_decrypt_image_in_base64(row[20])
sql = "Insert into items (item_id, market_id, vendor_id, name_item, description_item, cve_item, ms_item, category_item, " \
"views_item, reviews_item, rating_item, dateadded_item, btc_item, usd_item, euro_item, quantitysold_item, " \
"quantityleft_item, shippedfrom_item, shippedto_item, lastseen_item, image_item, href_item, dateinserted_item, " \
@@ -312,7 +314,7 @@ def create_items(cur, row, marketId, vendorId):
recset = cur.fetchall()
# decode_decrypt_image_in_base64(recset[0][20])
# decode_decrypt_image_in_base64(recset[0]['image_item'])
if (str(recset[0]['description_item']) != str(row[5] if row[5] != '-1' else None) or
str(recset[0]['cve_item']) != str(row[6] if row[6] != '-1' else None) or
@@ -401,24 +403,27 @@ def create_items(cur, row, marketId, vendorId):
return itemId
def create_status(cur, marketId, date, status):
def create_status(cur, marketId, date, listings, descriptions, status):
date = datetime.strptime(date, "%m%d%Y")
# getting last Friday as a reference date
date_reference = date + relativedelta(weekday=FR(-1))
# checking if status already exists
sql = "select * from marketplaces_status where market_id = %(market_id)s and date_inserted = %(date_inserted)s"
cur.execute(sql, {'market_id': marketId, 'date_inserted': date})
recset = cur.fetchall()
if recset:
sql = "Update marketplaces_status set status = %(status)s where market_id = %(market_id)s and date_inserted = %(date_inserted)s"
recset = {'status': status, 'market_id': marketId, 'date_inserted': date}
sql = "Update marketplaces_status set listings = %(listings)s, descriptions = %(descriptions)s, status = %(status)s, date_reference = %(date_reference)s " \
"where market_id = %(market_id)s and date_inserted = %(date_inserted)s"
recset = {'listings': listings, 'descriptions': descriptions, 'status': status, 'date_reference': date_reference, 'market_id': marketId, 'date_inserted': date}
else:
sql = "Insert into marketplaces_status (market_id, date_inserted, status) Values (%s, %s, %s)"
recset = [marketId, date, status]
sql = "Insert into marketplaces_status (market_id, date_inserted, listings, descriptions, status, date_reference) Values (%s, %s, %s, %s, %s, %s)"
recset = [marketId, date, listings, descriptions, status, date_reference]
cur.execute(sql, recset)
def create_database(cur, con):
try:
@@ -431,7 +436,8 @@ def create_database(cur, con):
sql = "create unique index unique_market ON marketplaces USING btree (name_market ASC NULLS LAST)"
cur.execute(sql)
sql = "Create table marketplaces_status (market_id integer NOT NULL, date_inserted date NOT NULL, status bit(1) NOT NULL, " \
sql = "Create table marketplaces_status (market_id integer NOT NULL, date_inserted date NOT NULL, " \
"listings integer NOT NULL, descriptions integer NOT NULL, status bit(1) NOT NULL, date_reference date NOT NULL " \
"CONSTRAINT marketplaces_log_pkey PRIMARY KEY (market_id, date_inserted), " \
"CONSTRAINT marketplaces_fk FOREIGN KEY (market_id) REFERENCES marketplaces (market_id))"
cur.execute(sql)
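As in the forums version, the update branch of create_status binds a dict to named %(key)s placeholders while the insert branch binds a list to positional %s placeholders; psycopg2 accepts both styles. A minimal illustration of the two styles against marketplaces_status (connection parameters and values are placeholders):

# Hedged sketch of the two psycopg2 parameter styles used in create_status.
import psycopg2

con = psycopg2.connect(host="localhost", dbname="darkweb", user="postgres", password="postgres")  # placeholders
cur = con.cursor()

# Named placeholders bound from a dict (update branch style)
cur.execute("Update marketplaces_status set status = %(status)s "
            "where market_id = %(market_id)s and date_inserted = %(date_inserted)s",
            {'status': '1', 'market_id': 1, 'date_inserted': '2024-01-05'})

# Positional placeholders bound from a list (insert branch style)
cur.execute("Insert into marketplaces_status (market_id, date_inserted, listings, descriptions, status, date_reference) "
            "Values (%s, %s, %s, %s, %s, %s)",
            [1, '2024-01-05', 25, 100, '1', '2024-01-05'])
con.commit()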


+7 -7  MarketPlaces/DarkBazar/crawler_selenium.py

@@ -216,12 +216,12 @@ def crawlForum(driver):
savePage(driver, driver.page_source, item)
driver.back()
# # comment out
# break
#
# # comment out
# if count == 1:
# break
# comment out
break
# comment out
if count == 1:
break
try:
link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
@@ -236,7 +236,7 @@ def crawlForum(driver):
print(link, e)
i += 1
print("Crawling the DarkBazar market done.")
print("Crawling the DarkBazar market done.")
# Returns 'True' if the link is Topic link, may need to change for every website
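The DarkBazar pager is followed through the anchor whose text contains "Next". A condensed sketch of that pagination pattern with Selenium, assuming a driver that is already on a listing page (the scraping body is elided):

# Hedged sketch of the "Next"-link pagination loop used in crawlForum.
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

def follow_next_links(driver):
    while True:
        # ... save and parse the current listing page here ...
        try:
            link = driver.find_element(by=By.XPATH,
                                       value='//a[contains(text(), "Next")]').get_attribute('href')
            if link == "":
                break
            driver.get(link)
        except NoSuchElementException:
            break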


+0 -1  MarketPlaces/DarkBazar/parser.py

@@ -170,7 +170,6 @@ def darkbazar_listing_parser(soup):
# Adding the url to the list of urls
link = bae[0].get('href')
link = cleanLink(link)
href.append(link)
# Finding the Product


+21 -2  MarketPlaces/Initialization/prepare_parser.py

@@ -15,6 +15,9 @@ from MarketPlaces.M00nkeyMarket.parser import *
from MarketPlaces.MikesGrandStore.parser import *
from MarketPlaces.PabloEscobarMarket.parser import *
from MarketPlaces.CityMarket.parser import *
from MarketPlaces.DarkBazar.parser import *
from MarketPlaces.Sonanza.parser import *
from MarketPlaces.Kingdom.parser import *
from MarketPlaces.Classifier.classify_product import predict
@@ -130,6 +133,12 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
rw = pabloescobarmarket_listing_parser(soup)
elif marketPlace == "CityMarket":
rw = city_listing_parser(soup)
elif marketPlace == "DarkBazar":
rw = darkbazar_listing_parser(soup)
elif marketPlace == "Sonanza":
rw = sonanza_listing_parser(soup)
elif marketPlace == "Kingdom":
rw = kingdom_listing_parser(soup)
else:
print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
raise Exception
@@ -164,6 +173,12 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
rmm = pabloescobarmarket_description_parser(soup)
elif marketPlace == "CityMarket":
rmm = city_description_parser(soup)
elif marketPlace == "DarkBazar":
rmm = darkbazar_description_parser(soup)
elif marketPlace == "Sonanza":
rmm = sonanza_description_parser(soup)
elif marketPlace == "Kingdom":
rmm = kingdom_description_parser(soup)
else:
print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
raise Exception
@@ -363,10 +378,14 @@ def new_parse(marketPlace, url, createLog):
# move listing files of completed folder
move_file(listingFile, createLog, logFile)
# registering the current market status (up/down) in the database
# registering the current market status (up/down) and the number of scraped pages in the database
marketId = verifyMarketPlace(cur, marketPlace)
if (marketId > 0):
create_status(cur, marketId, CURRENT_DATE, '1' if len(listings) > 0 else '0')
readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.html'))
readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.html'))
create_status(cur, marketId, CURRENT_DATE, len(readListings), len(readDescriptions), '1' if len(listings) > 0 else '0')
con.commit()
if createLog:
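Each newly supported market (DarkBazar, Sonanza, Kingdom) adds another elif pair to parse_listing and parse_description. A hedged sketch of an equivalent table-driven dispatch; the parser functions are the ones imported at the top of this file, but the mapping itself is illustrative and not part of the repo:

# Hedged sketch: table-driven equivalent of the elif dispatch in parse_listing.
LISTING_PARSERS = {
    "DarkBazar": darkbazar_listing_parser,
    "Sonanza": sonanza_listing_parser,
    "Kingdom": kingdom_listing_parser,
    "CityMarket": city_listing_parser,
    # ... remaining markets ...
}

def dispatch_listing_parser(marketPlace, soup):
    parser = LISTING_PARSERS.get(marketPlace)
    if parser is None:
        print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
        raise Exception
    return parser(soup)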


+27 -94  MarketPlaces/Kingdom/crawler_selenium.py

@@ -1,4 +1,4 @@
__author__ = 'DarkWeb'
__author__ = 'Helium'
'''
Kingdom Market Crawler (Selenium)
@@ -35,55 +35,27 @@ baseURL = 'http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion
# Opens Tor Browser, crawls the website
def startCrawling():
# marketName = getMarketName()
mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
captcha(driver)
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
# new_parse(marketName, False)
new_parse(mktName, baseURL, True)
# Login using premade account credentials and do login captcha manually
def login(driver):
def captcha(driver):
'''
# wait for captcha page
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div/div[1]")))
# save captcha to local
driver.find_element(by=By.XPATH, value='/html/body/div/div[2]').screenshot(
r'..\Kingdom\captcha1.png')
# This method will show image in any image viewer
im = Image.open(r'..\Kingdom\captcha1.png')
im.show()
iframes = driver.find_elements(by=By.TAG_NAME, value='iframe')
# ask user input captcha solution in terminal
print("Enter squares from smallest to largest (squares are numbered 1-9 left to right)")
for order in ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']:
id = input(f"{order}: ")
iframes[int(id)-1].click()
'''
input("Press ENTER when CAPTCHA is completed\n")
# wait for login page
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button")))
# Login using premade account credentials and do login captcha manually
def login(driver):
# wait for login page
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button")))
(By.XPATH, '//*[@id="login-form"]')))
# entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-name"]')
@@ -96,39 +68,17 @@ def login(driver):
select = Select(driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-sessiontime"]'))
select.select_by_visible_text('24 hours')
'''
# wait for captcha page show up
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, '//*[@id="captcha"]')))
# save captcha to local
driver.find_element(by=By.XPATH, value='//*[@id="captcha"]').screenshot(r'..\Kingdom\captcha2.png')
# This method will show image in any image viewer
im = Image.open(r'..\Kingdom\captcha2.png')
im.show()
# wait until input space show up
inputBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-captcha"]')
# ask user input captcha solution in terminal
userIn = input("Enter solution: ")
# send user solution into the input space
inputBox.send_keys(userIn)
# click the verify(submit) button
driver.find_element(by=By.XPATH, value="/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button").click()
'''
input("Press ENTER when CAPTCHA is completed\n")
input("Press ENTER when CAPTCHA and DDOS is completed\n")
# wait for listing page show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.XPATH, '/html/body/div/div/div[3]/div[2]')))
(By.XPATH, '/html/body/div/div/div[3]/div[1]/div/div[3]')))
# Returns the name of the website
def getMarketName():
def getMKTName():
name = 'Kingdom'
return name
@@ -236,30 +186,17 @@ def getInterestedLinks():
links = []
# Software and Malware
links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=127&t=c298a77d9e93ad32')
links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=127&t=597a56b9a0b3e0d0')
# # Services
# links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=45&t=c298a77d9e93ad32')
# # Exploits
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=45')
# # Tools
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=46')
# # Malware
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=47')
# # Cryptography
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=48')
# # Others
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=49')
# # Hacking Tutorials
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=50')
# # Hacked Accounts and Database Dumps
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=30')
# # Android Moded pak
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=53')
links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=45&t=597a56b9a0b3e0d0')
# # guides and tutorials
links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=107&t=597a56b9a0b3e0d0')
return links
def crawlForum(driver):
print("Crawling the Kingdom market")
linksToCrawl = getInterestedLinks()
@@ -281,6 +218,7 @@ def crawlForum(driver):
savePage(driver, html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
@@ -290,18 +228,15 @@ def crawlForum(driver):
savePage(driver, driver.page_source, item)
driver.back()
# comment out
break
# comment out
if count == 1:
break
# # comment out
# break
#
# # comment out
# if count == 1:
# break
try:
temp = driver.find_element(by=By.XPATH, value=
'/html/body/div/div/div[3]/div[2]/div[2]/div/div/ul')
next = temp.find_element_by_class_name("next")
link = link.find_element_by_tag_name('a').get_attribute('href')
link = driver.find_element(by=By.XPATH, value='/html/body/div/div/div[3]/div[2]/div[2]/div[3]/div/ul/li[13]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
@@ -313,7 +248,7 @@ def crawlForum(driver):
print(link, e)
i += 1
input("Crawling Kingdom Market done sucessfully. Press ENTER to continue\n")
print("Crawling the Kingdom market done.")
# Returns 'True' if the link is Topic link
@@ -325,7 +260,7 @@ def isDescriptionLink(url):
# Returns True if the link is a listingPage link
def isListingLink(url):
if 'category' in url:
if 'filter_category' in url:
return True
return False
@@ -333,10 +268,8 @@ def isListingLink(url):
# calling the parser to define the links
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
#print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text)
return kingdom_links_parser(soup)
def crawler():
startCrawling()
# print("Crawling and Parsing BestCardingWorld .... DONE!")
startCrawling()
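Kingdom's login is now gated on a manual CAPTCHA/DDoS step followed by an explicit wait for the listing container. A condensed sketch of that gate, assuming Selenium 4 and the XPath shown in the diff:

# Hedged sketch of the manual CAPTCHA gate plus explicit wait used in login().
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_kingdom_listing(driver):
    input("Press ENTER when CAPTCHA and DDOS is completed\n")
    WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
        (By.XPATH, '/html/body/div/div/div[3]/div[1]/div/div[3]')))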

+17 -4  MarketPlaces/Kingdom/parser.py

@@ -1,4 +1,4 @@
__author__ = 'DarkWeb'
__author__ = 'Helium'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
@@ -31,6 +31,8 @@ def kingdom_description_parser(soup):
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
image = "-1" # 19 Product_Image
vendor_image = "-1" # 20 Vendor_Image
# Finding Product Name
@@ -95,7 +97,7 @@ def kingdom_description_parser(soup):
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo)
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
# Sending the results
@@ -126,7 +128,9 @@ def kingdom_listing_parser(soup):
qLeft =[] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
href = [] # 20 Product_Links
image = [] # 20 Product_Image
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
listing = soup.find('div', {"id": "p0"}).find('div').find_all('div', {"class": "row"}, recursive=False)
@@ -153,12 +157,20 @@ def kingdom_listing_parser(soup):
product = product.strip()
name.append(product)
# Finding Product Image
product_image = a.find('img')
product_image = product_image.get('src')
product_image = product_image.split('base64,')[-1]
image.append(product_image)
# Finding the Vendor
vendor_name = a.select_one('a[href^="/user"]').text
vendor_name = vendor_name.replace(",", " ").replace('/', '')
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
image_vendor.append("-1")
# Adding the url to the list of urls
link = a.find('div', {"class": "col-md-7"}).select_one('a[href^="/offer/view?"]')['href']
link = cleanLink(link)
@@ -169,7 +181,8 @@ def kingdom_listing_parser(soup):
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href,
image, image_vendor)
def kingdom_links_parser(soup):
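The listing parser keeps only the base64 payload of each product thumbnail by splitting the data URI on 'base64,'. A small round-trip sketch of that split; the image bytes below are a made-up stand-in:

# Hedged sketch of the Product_Image handling, with stand-in image bytes.
import base64

fake_png = b"\x89PNG\r\n\x1a\n...stand-in bytes..."
src = "data:image/png;base64," + base64.b64encode(fake_png).decode()

product_image = src.split('base64,')[-1]   # same split as kingdom_listing_parser
assert base64.b64decode(product_image) == fake_png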

