diff --git a/Forums/Utilities/utilities.py b/Forums/Utilities/utilities.py
index fdd5495..a9e0026 100644
--- a/Forums/Utilities/utilities.py
+++ b/Forums/Utilities/utilities.py
@@ -4,11 +4,8 @@ import string
 import time
 import re
 import hashlib
-import imghdr
 import base64
-import requests
 import io
-import urllib.parse as urlparse
 from datetime import datetime, timedelta
 import datetime as fulldatetime
 from bs4 import BeautifulSoup
@@ -22,11 +19,11 @@ from PIL import Image
 def generate_aes_key():
     from Forums.Initialization.forums_mining import config
 
-    password = "password"
-    password_bytes = bytes(password, encoding="utf-8")
+    secret = config.get('Encryption', 'secret')
+    secret_bytes = bytes(secret, encoding="utf-8")
 
     # Derive a key from the seed using PBKDF2
-    key = hashlib.pbkdf2_hmac(hash_name='sha256', password=password_bytes, salt=bytes(), iterations=1)
+    key = hashlib.pbkdf2_hmac(hash_name='sha256', password=secret_bytes, salt=bytes(), iterations=1)
 
     # Use the first 16 bytes of the derived key as the AES key
     aes_key = key[:16]
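A minimal, standalone sketch of the derivation that generate_aes_key() now performs, assuming `secret` has already been read from the new [Encryption] section of setup.ini (the hardcoded value below is only a stand-in):

    import hashlib

    secret = "password"  # stand-in for config.get('Encryption', 'secret')
    secret_bytes = bytes(secret, encoding="utf-8")

    # PBKDF2-HMAC-SHA256 with an empty salt and a single iteration, as in the
    # patch; the 32-byte digest is truncated to a 16-byte AES-128 key.
    key = hashlib.pbkdf2_hmac(hash_name='sha256', password=secret_bytes, salt=bytes(), iterations=1)
    aes_key = key[:16]
    assert len(aes_key) == 16

Because the salt is empty and iterations is 1, the derivation is deterministic: the Forums and MarketPlaces packages derive the identical key from the same secret, so pages encrypted by one side stay decryptable by the other.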
diff --git a/MarketPlaces/AnonymousMarketplace/crawler_selenium.py b/MarketPlaces/AnonymousMarketplace/crawler_selenium.py
index d09469f..ef3b475 100644
--- a/MarketPlaces/AnonymousMarketplace/crawler_selenium.py
+++ b/MarketPlaces/AnonymousMarketplace/crawler_selenium.py
@@ -145,8 +145,8 @@ def login(driver):
         (By.ID, "woocommerce_product_categories-2")))
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -217,7 +217,7 @@ def crawlForum(driver):
                 except:
                     driver.refresh()
                 html = driver.page_source
-                savePage(html, link)
+                savePage(driver, html, link)
 
                 list = productPages(html)
                 for item in list:
@@ -226,7 +226,7 @@ def crawlForum(driver):
                         driver.get(itemURL)
                     except:
                         driver.refresh()
-                    savePage(driver.page_source, item)
+                    savePage(driver, driver.page_source, item)
                     driver.back()
 
                     # comment out
diff --git a/MarketPlaces/Apocalypse/crawler_selenium.py b/MarketPlaces/Apocalypse/crawler_selenium.py
index 134f4d8..7b67cc4 100644
--- a/MarketPlaces/Apocalypse/crawler_selenium.py
+++ b/MarketPlaces/Apocalypse/crawler_selenium.py
@@ -161,8 +161,8 @@ def login(driver):
         (By.XPATH, "/html/body/div[1]/div[2]/div[1]/div[1]/a[13]")))
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -231,7 +231,7 @@ def crawlForum(driver):
                 except:
                     driver.refresh()
                 html = driver.page_source
-                savePage(html, link)
+                savePage(driver, html, link)
 
                 list = productPages(html)
                 for item in list:
@@ -240,7 +240,7 @@ def crawlForum(driver):
                         driver.get(itemURL)
                     except:
                         driver.refresh()
-                    savePage(driver.page_source, item)
+                    savePage(driver, driver.page_source, item)
                     driver.back()
 
                     # comment out
diff --git a/MarketPlaces/BlackPyramid/crawler_selenium.py b/MarketPlaces/BlackPyramid/crawler_selenium.py
index 5ce0101..dd3e251 100644
--- a/MarketPlaces/BlackPyramid/crawler_selenium.py
+++ b/MarketPlaces/BlackPyramid/crawler_selenium.py
@@ -160,8 +160,8 @@ def login(driver):
 
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -237,7 +237,7 @@ def crawlForum(driver):
                 except:
                     driver.refresh()
                 html = driver.page_source
-                savePage(html, link)
+                savePage(driver, html, link)
 
                 list = productPages(html)
                 for item in list:
@@ -246,7 +246,7 @@ def crawlForum(driver):
                         driver.get(itemURL)
                     except:
                         driver.refresh()
-                    savePage(driver.page_source, item)
+                    savePage(driver, driver.page_source, item)
                     driver.back()
 
                     # comment out
diff --git a/MarketPlaces/CityMarket/crawler_selenium.py b/MarketPlaces/CityMarket/crawler_selenium.py
index ff30bf0..6fbc683 100644
--- a/MarketPlaces/CityMarket/crawler_selenium.py
+++ b/MarketPlaces/CityMarket/crawler_selenium.py
@@ -158,8 +158,8 @@ def login(driver):
         (By.XPATH, '//*[@id="collapse3"]')))
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -236,7 +236,7 @@ def crawlForum(driver):
                 except:
                     driver.refresh()
                 html = driver.page_source
-                savePage(html, link)
+                savePage(driver, html, link)
 
                 list = productPages(html)
                 for item in list:
@@ -245,7 +245,7 @@ def crawlForum(driver):
                         driver.get(itemURL)
                     except:
                         driver.refresh()
-                    savePage(driver.page_source, item)
+                    savePage(driver, driver.page_source, item)
                     driver.back()
 
                     # comment out
diff --git a/MarketPlaces/CypherMarketplace/crawler_selenium.py b/MarketPlaces/CypherMarketplace/crawler_selenium.py
index 120ed32..b39a74a 100644
--- a/MarketPlaces/CypherMarketplace/crawler_selenium.py
+++ b/MarketPlaces/CypherMarketplace/crawler_selenium.py
@@ -159,8 +159,8 @@ def login(driver):
         (By.XPATH, "/html/body/div[2]/div/div/div[1]/div/div/div[1]/div[2]/ul/li[8]/a")))
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -229,7 +229,7 @@ def crawlForum(driver):
                 except:
                     driver.refresh()
                 html = driver.page_source
-                savePage(html, link)
+                savePage(driver, html, link)
 
                 list = productPages(html)
                 for item in list:
@@ -238,7 +238,7 @@ def crawlForum(driver):
                         driver.get(itemURL)
                     except:
                         driver.refresh()
-                    savePage(driver.page_source, item)
+                    savePage(driver, driver.page_source, item)
                     driver.back()
 
                     # comment out
diff --git a/MarketPlaces/DarkFox/crawler_selenium.py b/MarketPlaces/DarkFox/crawler_selenium.py
index 61927d6..0f7ee1d 100644
--- a/MarketPlaces/DarkFox/crawler_selenium.py
+++ b/MarketPlaces/DarkFox/crawler_selenium.py
@@ -175,8 +175,8 @@ def captcha(driver):
 
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -254,7 +254,7 @@ def crawlForum(driver):
                 except:
                     driver.refresh()
                 html = driver.page_source
-                savePage(html, link)
+                savePage(driver, html, link)
 
                 list = productPages(html)
                 for item in list:
@@ -263,7 +263,7 @@ def crawlForum(driver):
                         driver.get(itemURL)
                     except:
                         driver.refresh()
-                    savePage(driver.page_source, item)
+                    savePage(driver, driver.page_source, item)
                     driver.back()
 
                     # comment out
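The same two-line change (savePage and cleanHTML gaining a driver parameter) repeats in every crawler above and below. The driver is needed because image bytes are now captured by screenshotting the rendered <img> elements in the live Tor session rather than being fetched separately (note the removal of the requests and urllib imports in the Forums utilities). A minimal sketch of that capture, using standard Selenium calls (the URL and XPath are hypothetical):

    from selenium.webdriver.common.by import By

    # driver is the already-authenticated Firefox/Tor WebDriver used by the crawler
    driver.get("http://example.onion/listing")            # hypothetical page
    img = driver.find_element(by=By.XPATH, value="//img[1]")
    png_bytes = img.screenshot_as_png                     # PNG bytes of the element as rendered

replace_image_sources() in MarketPlaces/Utilities/utilities.py (further down in this patch) runs this capture for every <img> before the raw markup is scrubbed, which is why the driver must be threaded through savePage() at each call site.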
diff --git a/MarketPlaces/DarkMatter/crawler_selenium.py b/MarketPlaces/DarkMatter/crawler_selenium.py
index fffd3fd..c1eb457 100644
--- a/MarketPlaces/DarkMatter/crawler_selenium.py
+++ b/MarketPlaces/DarkMatter/crawler_selenium.py
@@ -145,8 +145,8 @@ def login(driver):
     # wait for page to show up (This Xpath may need to change based on different seed url)
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -221,7 +221,7 @@ def crawlForum(driver):
                 except:
                     driver.refresh()
                 html = driver.page_source
-                savePage(html, link)
+                savePage(driver, html, link)
 
                 list = productPages(html)
                 for item in list:
@@ -231,7 +231,7 @@ def crawlForum(driver):
                         driver.get(itemURL)
                     except:
                         driver.refresh()
-                    savePage(driver.page_source, item)
+                    savePage(driver, driver.page_source, item)
                     time.sleep(1.5)
                     driver.back()
                     # to keep from detecting click speed
diff --git a/MarketPlaces/DarkTor/crawler_selenium.py b/MarketPlaces/DarkTor/crawler_selenium.py
index d84de5c..74e22be 100644
--- a/MarketPlaces/DarkTor/crawler_selenium.py
+++ b/MarketPlaces/DarkTor/crawler_selenium.py
@@ -144,8 +144,8 @@ def login(driver):
         (By.XPATH, "/html/body/div[1]/div/div/div[2]/main/div/div/section[5]/div/div[1]/div")))
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -216,7 +216,7 @@ def crawlForum(driver):
                 except:
                     driver.refresh()
                 html = driver.page_source
-                savePage(html, link)
+                savePage(driver, html, link)
 
                 list = productPages(html)
                 for item in list:
@@ -225,7 +225,7 @@ def crawlForum(driver):
                         driver.get(itemURL)
                     except:
                         driver.refresh()
-                    savePage(driver.page_source, item)
+                    savePage(driver, driver.page_source, item)
                     driver.back()
 
                     # comment out
diff --git a/MarketPlaces/DigitalThriftShop/crawler_selenium.py b/MarketPlaces/DigitalThriftShop/crawler_selenium.py
index 58c833a..28424a8 100644
--- a/MarketPlaces/DigitalThriftShop/crawler_selenium.py
+++ b/MarketPlaces/DigitalThriftShop/crawler_selenium.py
@@ -145,8 +145,8 @@ def login(driver):
         (By.ID, "woocommerce_product_categories-2")))
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -221,7 +221,7 @@ def crawlForum(driver):
                 except:
                     driver.refresh()
                 html = driver.page_source
-                savePage(html, link)
+                savePage(driver, html, link)
 
                 list = productPages(html)
                 for item in list:
@@ -230,7 +230,7 @@ def crawlForum(driver):
                         driver.get(itemURL)
                     except:
                         driver.refresh()
-                    savePage(driver.page_source, item)
+                    savePage(driver, driver.page_source, item)
                     driver.back()
 
                     # comment out
diff --git a/MarketPlaces/HiddenMarket/crawler_selenium.py b/MarketPlaces/HiddenMarket/crawler_selenium.py
index c89d467..1b3e1b5 100644
--- a/MarketPlaces/HiddenMarket/crawler_selenium.py
+++ b/MarketPlaces/HiddenMarket/crawler_selenium.py
@@ -176,8 +176,8 @@ def getAccess():
 
 
 # Saves the crawled html page
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -262,7 +262,7 @@ def crawlForum(driver):
                 except:
                     driver.refresh()
                 html = driver.page_source
-                savePage(html, link)
+                savePage(driver, html, link)
 
                 list = productPages(html)
                 for item in list:
@@ -271,7 +271,7 @@ def crawlForum(driver):
                         driver.get(itemURL)
                     except:
                         driver.refresh()
-                    savePage(driver.page_source, item)
+                    savePage(driver, driver.page_source, item)
                     driver.back()
 
                     # comment out
diff --git a/MarketPlaces/LionMarketplace/crawler_selenium.py b/MarketPlaces/LionMarketplace/crawler_selenium.py
index d6a613b..7558452 100644
--- a/MarketPlaces/LionMarketplace/crawler_selenium.py
+++ b/MarketPlaces/LionMarketplace/crawler_selenium.py
@@ -145,8 +145,8 @@ def login(driver):
 
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -219,7 +219,7 @@ def crawlForum(driver):
                 except:
                     driver.refresh()
                 html = driver.page_source
-                savePage(html, link)
+                savePage(driver, html, link)
 
                 list = productPages(html)
                 for item in list:
@@ -228,7 +228,7 @@ def crawlForum(driver):
                         driver.get(itemURL)
                     except:
                         driver.refresh()
-                    savePage(driver.page_source, item)
+                    savePage(driver, driver.page_source, item)
                     driver.back()
 
                     # # comment out
diff --git a/MarketPlaces/M00nkeyMarket/crawler_selenium.py b/MarketPlaces/M00nkeyMarket/crawler_selenium.py
index f0b41a4..83413fc 100644
--- a/MarketPlaces/M00nkeyMarket/crawler_selenium.py
+++ b/MarketPlaces/M00nkeyMarket/crawler_selenium.py
@@ -160,8 +160,8 @@ def login(driver):
         (By.XPATH, "/html/body/div/div[1]/div/div/div[2]/div[3]/div")))
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -230,7 +230,7 @@ def crawlForum(driver):
                 except:
                     driver.refresh()
                 html = driver.page_source
-                savePage(html, link)
+                savePage(driver, html, link)
 
                 list = productPages(html)
                 for item in list:
@@ -239,7 +239,7 @@ def crawlForum(driver):
                         driver.get(itemURL)
                     except:
                         driver.refresh()
-                    savePage(driver.page_source, item)
+                    savePage(driver, driver.page_source, item)
                     driver.back()
 
                     # comment out
diff --git a/MarketPlaces/MikesGrandStore/crawler_selenium.py b/MarketPlaces/MikesGrandStore/crawler_selenium.py
index bb7d1f8..b6b67ac 100644
--- a/MarketPlaces/MikesGrandStore/crawler_selenium.py
+++ b/MarketPlaces/MikesGrandStore/crawler_selenium.py
@@ -172,8 +172,8 @@ def login(driver):
         (By.XPATH, "/html/body/div[1]/header/div/div[3]/div/div/ul/li[6]/a")))
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -242,7 +242,7 @@ def crawlForum(driver):
                 except:
                     driver.refresh()
                 html = driver.page_source
-                savePage(html, link)
+                savePage(driver, html, link)
 
                 list = productPages(html)
                 for item in list:
@@ -251,7 +251,7 @@ def crawlForum(driver):
                         driver.get(itemURL)
                     except:
                         driver.refresh()
-                    savePage(driver.page_source, item)
+                    savePage(driver, driver.page_source, item)
                     driver.back()
 
                     # comment out
"/html/body/div[1]/header/div/div[3]/div/div/ul/li[6]/a"))) # Saves the crawled html page, makes the directory path for html pages if not made -def savePage(page, url): - cleanPage = cleanHTML(page) +def savePage(driver, page, url): + cleanPage = cleanHTML(driver, page) filePath = getFullPathName(url) os.makedirs(os.path.dirname(filePath), exist_ok=True) open(filePath, 'wb').write(cleanPage.encode('utf-8')) @@ -242,7 +242,7 @@ def crawlForum(driver): except: driver.refresh() html = driver.page_source - savePage(html, link) + savePage(driver, html, link) list = productPages(html) for item in list: @@ -251,7 +251,7 @@ def crawlForum(driver): driver.get(itemURL) except: driver.refresh() - savePage(driver.page_source, item) + savePage(driver, driver.page_source, item) driver.back() # comment out diff --git a/MarketPlaces/RobinhoodMarket/crawler_selenium.py b/MarketPlaces/RobinhoodMarket/crawler_selenium.py index 88deb1e..ab22f78 100644 --- a/MarketPlaces/RobinhoodMarket/crawler_selenium.py +++ b/MarketPlaces/RobinhoodMarket/crawler_selenium.py @@ -140,8 +140,8 @@ def getAccess(): # Saves the crawled html page -def savePage(page, url): - cleanPage = cleanHTML(page) +def savePage(driver, page, url): + cleanPage = cleanHTML(driver, page) filePath = getFullPathName(url) os.makedirs(os.path.dirname(filePath), exist_ok=True) open(filePath, 'wb').write(cleanPage.encode('utf-8')) @@ -199,7 +199,7 @@ def crawlForum(driver): except: driver.refresh() html = driver.page_source - savePage(html, link) + savePage(driver, html, link) has_next_page = True while has_next_page: @@ -211,7 +211,7 @@ def crawlForum(driver): driver.get(itemURL) except: driver.refresh() - savePage(driver.page_source, item) + savePage(driver, driver.page_source, item) driver.back() # comment out # break @@ -233,7 +233,7 @@ def crawlForum(driver): except: driver.refresh() html = driver.page_source - savePage(html, link) + savePage(driver, html, link) count += 1 except NoSuchElementException: diff --git a/MarketPlaces/ThiefWorld/crawler_selenium.py b/MarketPlaces/ThiefWorld/crawler_selenium.py index 970a390..194f449 100644 --- a/MarketPlaces/ThiefWorld/crawler_selenium.py +++ b/MarketPlaces/ThiefWorld/crawler_selenium.py @@ -153,8 +153,8 @@ def login(driver): (By.ID, "side-bar"))) # Saves the crawled html page, makes the directory path for html pages if not made -def savePage(page, url): - cleanPage = cleanHTML(page) +def savePage(driver, page, url): + cleanPage = cleanHTML(driver, page) filePath = getFullPathName(url) os.makedirs(os.path.dirname(filePath), exist_ok=True) open(filePath, 'wb').write(cleanPage.encode('utf-8')) @@ -227,7 +227,7 @@ def crawlForum(driver): except: driver.refresh() html = driver.page_source - savePage(html, link) + savePage(driver, html, link) list = productPages(html) for item in list: @@ -236,7 +236,7 @@ def crawlForum(driver): driver.get(itemURL) except: driver.refresh() - savePage(driver.page_source, item) + savePage(driver, driver.page_source, item) driver.back() # comment out diff --git a/MarketPlaces/Tor2door/crawler_selenium.py b/MarketPlaces/Tor2door/crawler_selenium.py index 964c574..858ddcf 100644 --- a/MarketPlaces/Tor2door/crawler_selenium.py +++ b/MarketPlaces/Tor2door/crawler_selenium.py @@ -176,8 +176,8 @@ def getAccess(): # Saves the crawled html page -def savePage(page, url): - cleanPage = cleanHTML(page) +def savePage(driver, page, url): + cleanPage = cleanHTML(driver, page) filePath = getFullPathName(url) os.makedirs(os.path.dirname(filePath), exist_ok=True) open(filePath, 
diff --git a/MarketPlaces/TorBay/crawler_selenium.py b/MarketPlaces/TorBay/crawler_selenium.py
index 0861e82..7968089 100644
--- a/MarketPlaces/TorBay/crawler_selenium.py
+++ b/MarketPlaces/TorBay/crawler_selenium.py
@@ -145,8 +145,8 @@ def login(driver):
 
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -213,7 +213,7 @@ def crawlForum(driver):
                 except:
                     driver.refresh()
                 html = driver.page_source
-                savePage(html, link)
+                savePage(driver, html, link)
 
                 list = productPages(html)
                 for item in list:
@@ -222,7 +222,7 @@ def crawlForum(driver):
                         driver.get(itemURL)
                     except:
                         driver.refresh()
-                    savePage(driver.page_source, item)
+                    savePage(driver, driver.page_source, item)
                     driver.back()
 
                     # comment out
diff --git a/MarketPlaces/TorMarket/crawler_selenium.py b/MarketPlaces/TorMarket/crawler_selenium.py
index ed94a8b..7569045 100644
--- a/MarketPlaces/TorMarket/crawler_selenium.py
+++ b/MarketPlaces/TorMarket/crawler_selenium.py
@@ -144,8 +144,8 @@ def login(driver):
         (By.XPATH, "/html/body/div[2]/div/div/div/main/article/div/section[4]/div/div[1]/div/div/div/div/ul/li[15]/ul/li[3]/a")))
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -216,7 +216,7 @@ def crawlForum(driver):
                 except:
                     driver.refresh()
                 html = driver.page_source
-                savePage(html, link)
+                savePage(driver, html, link)
 
                 list = productPages(html)
                 for item in list:
@@ -225,7 +225,7 @@ def crawlForum(driver):
                         driver.get(itemURL)
                     except:
                         driver.refresh()
-                    savePage(driver.page_source, item)
+                    savePage(driver, driver.page_source, item)
                     driver.back()
 
                     # comment out
diff --git a/MarketPlaces/Utilities/utilities.py b/MarketPlaces/Utilities/utilities.py
index 60798fb..e8bea9f 100644
--- a/MarketPlaces/Utilities/utilities.py
+++ b/MarketPlaces/Utilities/utilities.py
@@ -3,7 +3,39 @@ __author__ = 'DarkWeb'
 import string
 import time
 import re
+import hashlib
+import base64
+import io
 from datetime import datetime, timedelta
+import datetime as fulldatetime
+from bs4 import BeautifulSoup
+from lxml import html as lxml
+from selenium.webdriver.common.by import By
+from Crypto.Cipher import AES
+from Crypto.Util.Padding import pad, unpad
+from PIL import Image
+
+
+def generate_aes_key():
+    from MarketPlaces.Initialization.markets_mining import config
+
+    secret = config.get('Encryption', 'secret')
+    secret_bytes = bytes(secret, encoding="utf-8")
+
+    # Derive a key from the seed using PBKDF2
+    key = hashlib.pbkdf2_hmac(hash_name='sha256', password=secret_bytes, salt=bytes(), iterations=1)
+
+    # Use the first 16 bytes of the derived key as the AES key
+    aes_key = key[:16]
+
+    # print("key: ", aes_key)
+    return aes_key
+
+
+BLOCK_SIZE = 32
+aes_key = generate_aes_key()
+encryptCipher = AES.new(aes_key, AES.MODE_ECB)
+decryptCipher = AES.new(aes_key, AES.MODE_ECB)
 
 
 def convertDate(sdate, language, crawlerDate):
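A round-trip sketch for the module-level ciphers defined in the hunk above, using the same pycryptodome pad/unpad helpers the patch imports (the key below is a placeholder for the one generate_aes_key() returns):

    from Crypto.Cipher import AES
    from Crypto.Util.Padding import pad, unpad

    BLOCK_SIZE = 32
    aes_key = b'0123456789abcdef'                    # placeholder 16-byte AES-128 key
    encryptCipher = AES.new(aes_key, AES.MODE_ECB)
    decryptCipher = AES.new(aes_key, AES.MODE_ECB)

    data = b'raw PNG bytes from a screenshot'
    ciphertext = encryptCipher.encrypt(pad(data, BLOCK_SIZE))   # padded to a 32-byte multiple
    assert unpad(decryptCipher.decrypt(ciphertext), BLOCK_SIZE) == data

ECB mode with a fixed key is deterministic, so identical images always produce identical ciphertext; the separate encrypt and decrypt cipher objects are interchangeable here, though keeping them distinct would matter for stateful modes such as CBC.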
@@ -292,7 +324,96 @@ def cleanNumbers(inputString):
     return updated_string
 
 
-def cleanHTML(html):
+def aes_encryption(item):
+
+    to_bytes = bytes(item)
+
+    encrypted_bytes = encryptCipher.encrypt(pad(to_bytes, BLOCK_SIZE))
+
+    return encrypted_bytes
+
+
+def aes_decryption(item):
+
+    to_bytes = bytes(item)
+
+    decrypted_bytes = decryptCipher.decrypt(to_bytes)
+
+    return unpad(decrypted_bytes, BLOCK_SIZE)
+
+
+def encrypt_encode_image_to_base64(driver, xpath):
+
+    try:
+
+        img_element = driver.find_element(by=By.XPATH, value=xpath)
+        image_data = img_element.screenshot_as_png
+
+        encrypted_image = aes_encryption(image_data)
+        base64_image = base64.b64encode(encrypted_image)
+        string_image = base64_image.decode('utf-8')
+
+        return string_image
+
+    except:
+        pass
+
+    return None
+
+
+def decode_decrypt_image_in_base64(html_content):
+
+    soup = BeautifulSoup(html_content, 'html.parser')
+
+    for img_tag in soup.find_all('img'):
+
+        src_attr = img_tag.get('src')
+
+        if src_attr and src_attr.startswith('data:image'):
+
+            try:
+
+                string_image = src_attr.split('base64,')[-1]
+                base64_image = bytes(string_image, encoding='utf-8')
+                encrypted_image = base64.b64decode(base64_image)
+                decrypted_image = aes_decryption(encrypted_image)
+
+                im = Image.open(io.BytesIO(decrypted_image))
+                im.show()
+
+            except Exception as e:
+                print(e)
+                pass
+
+
+def replace_image_sources(driver, html_content):
+
+    tree = lxml.fromstring(html_content)
+
+    for picture_tag in tree.findall('.//picture'):
+        for source_tag in picture_tag.findall('.//source'):
+            picture_tag.remove(source_tag)
+
+    for img_tag in tree.findall('.//img'):
+
+        img_xpath = tree.getroottree().getpath(img_tag)
+
+        string_image = encrypt_encode_image_to_base64(driver, img_xpath)
+
+        if string_image:
+            img_tag.set('src', f'data:image/png;base64,{string_image}')
+        else:
+            img_tag.getparent().remove(img_tag)
+
+    modified_html = lxml.tostring(tree, encoding='utf-8').decode('utf-8')
+
+    return modified_html
+
+
+def cleanHTML(driver, html):
+
+    clean_html = replace_image_sources(driver, html)
+    # decode_decrypt_image_in_base64(clean_html)
 
     formats = [
         "jpg", "jpeg", "jfif", "pjpeg", "pjp",
@@ -301,8 +422,6 @@ def cleanHTML(html):
         "png", "apng", "svg", "gif", "webp",
     ]
     # remove images
-    clean_html = re.sub(r"", "", html)
-    clean_html = re.sub(r"", "", clean_html)
     clean_html = re.sub(r"", "", clean_html)
     for fmat in formats:
         clean_html = re.sub(r"", "", clean_html)
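Usage sketch for the consumer side: recovering the images that replace_image_sources() embedded into a saved page (the file path is hypothetical; note that importing the module runs generate_aes_key(), so the [Encryption] secret must be configured):

    from MarketPlaces.Utilities.utilities import decode_decrypt_image_in_base64

    with open('saved_pages/example.html', encoding='utf-8') as f:
        html_content = f.read()

    # walks every <img src="data:image/png;base64,...">, base64-decodes the
    # payload, AES-decrypts it, and displays each recovered PNG via PIL's Image.show()
    decode_decrypt_image_in_base64(html_content)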
diff --git a/MarketPlaces/ViceCity/crawler_selenium.py b/MarketPlaces/ViceCity/crawler_selenium.py
index e49bbda..91b08cd 100644
--- a/MarketPlaces/ViceCity/crawler_selenium.py
+++ b/MarketPlaces/ViceCity/crawler_selenium.py
@@ -178,8 +178,8 @@ def login(driver):
 
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -251,7 +251,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             has_next_page = True
             while has_next_page:
@@ -264,7 +264,7 @@ def crawlForum(driver):
                 except:
                     driver.refresh()
                 time.sleep(2.5)  # to let page catchup
-                savePage(driver.page_source, item)
+                savePage(driver, driver.page_source, item)
                 time.sleep(2.5)  # so site doesnt crash
                 driver.back()
 
@@ -286,7 +286,7 @@ def crawlForum(driver):
                 except:
                     driver.refresh()
                 html = driver.page_source
-                savePage(html, link)
+                savePage(driver, html, link)
                 count += 1
 
         except NoSuchElementException:
diff --git a/setup.ini b/setup.ini
index 1202985..41b32d0 100644
--- a/setup.ini
+++ b/setup.ini
@@ -1,15 +1,18 @@
 [TOR]
-firefox_binary_path = C:\Users\John Wick\Desktop\Tor Browser\Browser\firefox.exe
-firefox_profile_path = C:\Users\John Wick\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default
-geckodriver_path = C:\Users\John Wick\PycharmProjects\dw_pipeline_test\selenium\geckodriver.exe
+firefox_binary_path = C:\Users\calsyslab\Desktop\Tor Browser\Browser\firefox.exe
+firefox_profile_path = C:\Users\calsyslab\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default
+geckodriver_path = C:\calsyslab\Project\dw_pipeline_test\selenium\geckodriver.exe
 
 [Project]
-project_directory = C:\Users\John Wick\PycharmProjects\dw_pipeline_test
-shared_folder = Z:\VBoxSvr\VM_Files_ (shared)
+project_directory = C:\calsyslab\Project\dw_pipeline_test
+shared_folder = \\VBoxSvr\\Shared
 
 [PostgreSQL]
 ip = localhost
 username = postgres
-password = postgres
-database = darkweb_markets_forums
\ No newline at end of file
+password = password
+database = darkweb_markets_forums
+
+[Encryption]
+secret = "password"
\ No newline at end of file
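For reference, a sketch of how the new [Encryption] section is consumed via configparser (an assumption based on the config.get() calls above; note that configparser does not strip quotes, so secret = "password" yields the ten-character string '"password"', quotes included, as the key-derivation seed):

    import configparser

    config = configparser.ConfigParser()
    config.read('setup.ini')
    secret = config.get('Encryption', 'secret')   # -> '"password"'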