
Added image AES encryption, Base64 encoding, and HTML embedding to MarketPlaces

main
westernmeadow, 1 year ago
commit dde9f499be
22 changed files with 213 additions and 94 deletions
1. +3 -6 Forums/Utilities/utilities.py
2. +4 -4 MarketPlaces/AnonymousMarketplace/crawler_selenium.py
3. +4 -4 MarketPlaces/Apocalypse/crawler_selenium.py
4. +4 -4 MarketPlaces/BlackPyramid/crawler_selenium.py
5. +4 -4 MarketPlaces/CityMarket/crawler_selenium.py
6. +4 -4 MarketPlaces/CypherMarketplace/crawler_selenium.py
7. +4 -4 MarketPlaces/DarkFox/crawler_selenium.py
8. +4 -4 MarketPlaces/DarkMatter/crawler_selenium.py
9. +4 -4 MarketPlaces/DarkTor/crawler_selenium.py
10. +4 -4 MarketPlaces/DigitalThriftShop/crawler_selenium.py
11. +4 -4 MarketPlaces/HiddenMarket/crawler_selenium.py
12. +4 -4 MarketPlaces/LionMarketplace/crawler_selenium.py
13. +4 -4 MarketPlaces/M00nkeyMarket/crawler_selenium.py
14. +4 -4 MarketPlaces/MikesGrandStore/crawler_selenium.py
15. +5 -5 MarketPlaces/RobinhoodMarket/crawler_selenium.py
16. +4 -4 MarketPlaces/ThiefWorld/crawler_selenium.py
17. +4 -4 MarketPlaces/Tor2door/crawler_selenium.py
18. +4 -4 MarketPlaces/TorBay/crawler_selenium.py
19. +4 -4 MarketPlaces/TorMarket/crawler_selenium.py
20. +122 -3 MarketPlaces/Utilities/utilities.py
21. +5 -5 MarketPlaces/ViceCity/crawler_selenium.py
22. +10 -7 setup.ini

Forums/Utilities/utilities.py (+3 -6)

@@ -4,11 +4,8 @@ import string
 import time
 import re
 import hashlib
-import imghdr
 import base64
-import requests
 import io
-import urllib.parse as urlparse
 from datetime import datetime, timedelta
 import datetime as fulldatetime
 from bs4 import BeautifulSoup
@@ -22,11 +19,11 @@ from PIL import Image
 def generate_aes_key():
     from Forums.Initialization.forums_mining import config
 
-    password = "password"
-    password_bytes = bytes(password, encoding="utf-8")
+    secret = config.get('Encryption', 'secret')
+    secret_bytes = bytes(secret, encoding="utf-8")
 
     # Derive a key from the seed using PBKDF2
-    key = hashlib.pbkdf2_hmac(hash_name='sha256', password=password_bytes, salt=bytes(), iterations=1)
+    key = hashlib.pbkdf2_hmac(hash_name='sha256', password=secret_bytes, salt=bytes(), iterations=1)
 
     # Use the first 16 bytes of the derived key as the AES key
     aes_key = key[:16]
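Both utilities modules now derive the key this way. Worth noting: with an empty salt and a single PBKDF2 iteration the derivation is deterministic, presumably so the same configured secret reproduces the same AES key across crawler runs. A minimal standalone sketch (the literal secret is a stand-in for the configured [Encryption] secret):

import hashlib

secret = "example-secret"  # stand-in for config.get('Encryption', 'secret')
secret_bytes = bytes(secret, encoding="utf-8")

# PBKDF2-HMAC-SHA256 with empty salt and one iteration, as in the diff
key = hashlib.pbkdf2_hmac(hash_name='sha256', password=secret_bytes, salt=bytes(), iterations=1)

# AES-128 key: first 16 bytes of the 32-byte digest
aes_key = key[:16]
assert len(aes_key) == 16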


MarketPlaces/AnonymousMarketplace/crawler_selenium.py (+4 -4)

@@ -145,8 +145,8 @@ def login(driver):
         (By.ID, "woocommerce_product_categories-2")))
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -217,7 +217,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             list = productPages(html)
             for item in list:
@@ -226,7 +226,7 @@ def crawlForum(driver):
                     driver.get(itemURL)
                 except:
                     driver.refresh()
-                savePage(driver.page_source, item)
+                savePage(driver, driver.page_source, item)
                 driver.back()
 
                 # comment out
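The same two-line signature change recurs in every crawler below: savePage now threads the live WebDriver through to cleanHTML, which has to screenshot each <img> element while the page is still open in the browser (see encrypt_encode_image_to_base64 in MarketPlaces/Utilities/utilities.py further down). A sketch of the resulting call pattern, with itemURL as a placeholder:

# the page must still be open when savePage runs, because cleanHTML
# screenshots <img> elements from the live DOM, not from the HTML string
driver.get(itemURL)
savePage(driver, driver.page_source, itemURL)  # screenshots happen in here
driver.back()                                  # only navigate away afterwards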


MarketPlaces/Apocalypse/crawler_selenium.py (+4 -4)

@@ -161,8 +161,8 @@ def login(driver):
         (By.XPATH, "/html/body/div[1]/div[2]/div[1]/div[1]/a[13]")))
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -231,7 +231,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             list = productPages(html)
             for item in list:
@@ -240,7 +240,7 @@ def crawlForum(driver):
                     driver.get(itemURL)
                 except:
                     driver.refresh()
-                savePage(driver.page_source, item)
+                savePage(driver, driver.page_source, item)
                 driver.back()
 
                 # comment out


MarketPlaces/BlackPyramid/crawler_selenium.py (+4 -4)

@@ -160,8 +160,8 @@ def login(driver):
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -237,7 +237,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             list = productPages(html)
             for item in list:
@@ -246,7 +246,7 @@ def crawlForum(driver):
                     driver.get(itemURL)
                 except:
                     driver.refresh()
-                savePage(driver.page_source, item)
+                savePage(driver, driver.page_source, item)
                 driver.back()
 
                 # comment out


MarketPlaces/CityMarket/crawler_selenium.py (+4 -4)

@@ -158,8 +158,8 @@ def login(driver):
         (By.XPATH, '//*[@id="collapse3"]')))
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -236,7 +236,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             list = productPages(html)
             for item in list:
@@ -245,7 +245,7 @@ def crawlForum(driver):
                     driver.get(itemURL)
                 except:
                     driver.refresh()
-                savePage(driver.page_source, item)
+                savePage(driver, driver.page_source, item)
                 driver.back()
 
                 # comment out


MarketPlaces/CypherMarketplace/crawler_selenium.py (+4 -4)

@@ -159,8 +159,8 @@ def login(driver):
         (By.XPATH, "/html/body/div[2]/div/div/div[1]/div/div/div[1]/div[2]/ul/li[8]/a")))
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -229,7 +229,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             list = productPages(html)
             for item in list:
@@ -238,7 +238,7 @@ def crawlForum(driver):
                     driver.get(itemURL)
                 except:
                     driver.refresh()
-                savePage(driver.page_source, item)
+                savePage(driver, driver.page_source, item)
                 driver.back()
 
                 # comment out


MarketPlaces/DarkFox/crawler_selenium.py (+4 -4)

@@ -175,8 +175,8 @@ def captcha(driver):
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -254,7 +254,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             list = productPages(html)
             for item in list:
@@ -263,7 +263,7 @@ def crawlForum(driver):
                     driver.get(itemURL)
                 except:
                     driver.refresh()
-                savePage(driver.page_source, item)
+                savePage(driver, driver.page_source, item)
                 driver.back()
 
                 # comment out


MarketPlaces/DarkMatter/crawler_selenium.py (+4 -4)

@@ -145,8 +145,8 @@ def login(driver):
     # wait for page to show up (This Xpath may need to change based on different seed url)
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -221,7 +221,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             list = productPages(html)
             for item in list:
@@ -231,7 +231,7 @@ def crawlForum(driver):
                     driver.get(itemURL)
                 except:
                     driver.refresh()
-                savePage(driver.page_source, item)
+                savePage(driver, driver.page_source, item)
                 time.sleep(1.5)
                 driver.back()
 
                 # to keep from detecting click speed


MarketPlaces/DarkTor/crawler_selenium.py (+4 -4)

@@ -144,8 +144,8 @@ def login(driver):
         (By.XPATH, "/html/body/div[1]/div/div/div[2]/main/div/div/section[5]/div/div[1]/div")))
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -216,7 +216,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             list = productPages(html)
             for item in list:
@@ -225,7 +225,7 @@ def crawlForum(driver):
                     driver.get(itemURL)
                 except:
                     driver.refresh()
-                savePage(driver.page_source, item)
+                savePage(driver, driver.page_source, item)
                 driver.back()
 
                 # comment out


MarketPlaces/DigitalThriftShop/crawler_selenium.py (+4 -4)

@@ -145,8 +145,8 @@ def login(driver):
         (By.ID, "woocommerce_product_categories-2")))
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -221,7 +221,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             list = productPages(html)
             for item in list:
@@ -230,7 +230,7 @@ def crawlForum(driver):
                     driver.get(itemURL)
                 except:
                     driver.refresh()
-                savePage(driver.page_source, item)
+                savePage(driver, driver.page_source, item)
                 driver.back()
 
                 # comment out


MarketPlaces/HiddenMarket/crawler_selenium.py (+4 -4)

@@ -176,8 +176,8 @@ def getAccess():
 
 # Saves the crawled html page
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -262,7 +262,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             list = productPages(html)
             for item in list:
@@ -271,7 +271,7 @@ def crawlForum(driver):
                     driver.get(itemURL)
                 except:
                     driver.refresh()
-                savePage(driver.page_source, item)
+                savePage(driver, driver.page_source, item)
                 driver.back()
 
                 # comment out


MarketPlaces/LionMarketplace/crawler_selenium.py (+4 -4)

@@ -145,8 +145,8 @@ def login(driver):
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -219,7 +219,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             list = productPages(html)
             for item in list:
@@ -228,7 +228,7 @@ def crawlForum(driver):
                     driver.get(itemURL)
                 except:
                     driver.refresh()
-                savePage(driver.page_source, item)
+                savePage(driver, driver.page_source, item)
                 driver.back()
 
                 # # comment out


MarketPlaces/M00nkeyMarket/crawler_selenium.py (+4 -4)

@@ -160,8 +160,8 @@ def login(driver):
         (By.XPATH, "/html/body/div/div[1]/div/div/div[2]/div[3]/div")))
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -230,7 +230,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             list = productPages(html)
             for item in list:
@@ -239,7 +239,7 @@ def crawlForum(driver):
                     driver.get(itemURL)
                 except:
                     driver.refresh()
-                savePage(driver.page_source, item)
+                savePage(driver, driver.page_source, item)
                 driver.back()
 
                 # comment out


MarketPlaces/MikesGrandStore/crawler_selenium.py (+4 -4)

@@ -172,8 +172,8 @@ def login(driver):
         (By.XPATH, "/html/body/div[1]/header/div/div[3]/div/div/ul/li[6]/a")))
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -242,7 +242,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             list = productPages(html)
             for item in list:
@@ -251,7 +251,7 @@ def crawlForum(driver):
                     driver.get(itemURL)
                 except:
                     driver.refresh()
-                savePage(driver.page_source, item)
+                savePage(driver, driver.page_source, item)
                 driver.back()
 
                 # comment out


MarketPlaces/RobinhoodMarket/crawler_selenium.py (+5 -5)

@@ -140,8 +140,8 @@ def getAccess():
 
 # Saves the crawled html page
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -199,7 +199,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             has_next_page = True
             while has_next_page:
@@ -211,7 +211,7 @@ def crawlForum(driver):
                     driver.get(itemURL)
                 except:
                     driver.refresh()
-                savePage(driver.page_source, item)
+                savePage(driver, driver.page_source, item)
                 driver.back()
 
                 # comment out
                 # break
@@ -233,7 +233,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
             count += 1
 
     except NoSuchElementException:


MarketPlaces/ThiefWorld/crawler_selenium.py (+4 -4)

@@ -153,8 +153,8 @@ def login(driver):
         (By.ID, "side-bar")))
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -227,7 +227,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             list = productPages(html)
             for item in list:
@@ -236,7 +236,7 @@ def crawlForum(driver):
                     driver.get(itemURL)
                 except:
                     driver.refresh()
-                savePage(driver.page_source, item)
+                savePage(driver, driver.page_source, item)
                 driver.back()
 
                 # comment out


MarketPlaces/Tor2door/crawler_selenium.py (+4 -4)

@@ -176,8 +176,8 @@ def getAccess():
 
 # Saves the crawled html page
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -243,7 +243,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             list = productPages(html)
             for item in list:
@@ -252,7 +252,7 @@ def crawlForum(driver):
                     driver.get(itemURL)
                 except:
                     driver.refresh()
-                savePage(driver.page_source, item)
+                savePage(driver, driver.page_source, item)
                 driver.back()
 
                 # comment out


MarketPlaces/TorBay/crawler_selenium.py (+4 -4)

@@ -145,8 +145,8 @@ def login(driver):
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -213,7 +213,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             list = productPages(html)
             for item in list:
@@ -222,7 +222,7 @@ def crawlForum(driver):
                    driver.get(itemURL)
                 except:
                     driver.refresh()
-                savePage(driver.page_source, item)
+                savePage(driver, driver.page_source, item)
                 driver.back()
 
                 # comment out


MarketPlaces/TorMarket/crawler_selenium.py (+4 -4)

@@ -144,8 +144,8 @@ def login(driver):
         (By.XPATH, "/html/body/div[2]/div/div/div/main/article/div/section[4]/div/div[1]/div/div/div/div/ul/li[15]/ul/li[3]/a")))
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -216,7 +216,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             list = productPages(html)
             for item in list:
@@ -225,7 +225,7 @@ def crawlForum(driver):
                     driver.get(itemURL)
                 except:
                     driver.refresh()
-                savePage(driver.page_source, item)
+                savePage(driver, driver.page_source, item)
                 driver.back()
 
                 # comment out


MarketPlaces/Utilities/utilities.py (+122 -3)

@@ -3,7 +3,39 @@ __author__ = 'DarkWeb'
 import string
 import time
 import re
+import hashlib
+import base64
+import io
 from datetime import datetime, timedelta
 import datetime as fulldatetime
 from bs4 import BeautifulSoup
+from lxml import html as lxml
+from selenium.webdriver.common.by import By
+from Crypto.Cipher import AES
+from Crypto.Util.Padding import pad, unpad
+from PIL import Image
+
+
+def generate_aes_key():
+    from MarketPlaces.Initialization.markets_mining import config
+
+    secret = config.get('Encryption', 'secret')
+    secret_bytes = bytes(secret, encoding="utf-8")
+
+    # Derive a key from the seed using PBKDF2
+    key = hashlib.pbkdf2_hmac(hash_name='sha256', password=secret_bytes, salt=bytes(), iterations=1)
+
+    # Use the first 16 bytes of the derived key as the AES key
+    aes_key = key[:16]
+
+    # print("key: ", aes_key)
+    return aes_key
+
+
+BLOCK_SIZE = 32
+
+aes_key = generate_aes_key()
+encryptCipher = AES.new(aes_key, AES.MODE_ECB)
+decryptCipher = AES.new(aes_key, AES.MODE_ECB)
@@ -292,7 +324,96 @@ def cleanNumbers(inputString):
 
     return updated_string
 
-def cleanHTML(html):
+
+def aes_encryption(item):
+    to_bytes = bytes(item)
+
+    encrypted_bytes = encryptCipher.encrypt(pad(to_bytes, BLOCK_SIZE))
+
+    return encrypted_bytes
+
+
+def aes_decryption(item):
+    to_bytes = bytes(item)
+
+    decrypted_bytes = decryptCipher.decrypt(to_bytes)
+
+    return unpad(decrypted_bytes, BLOCK_SIZE)
+
+
+def encrypt_encode_image_to_base64(driver, xpath):
+    try:
+        img_element = driver.find_element(by=By.XPATH, value=xpath)
+        image_data = img_element.screenshot_as_png
+
+        encrypted_image = aes_encryption(image_data)
+        base64_image = base64.b64encode(encrypted_image)
+        string_image = base64_image.decode('utf-8')
+
+        return string_image
+    except:
+        pass
+
+    return None
+
+
+def decode_decrypt_image_in_base64(html_content):
+    soup = BeautifulSoup(html_content, 'html.parser')
+
+    for img_tag in soup.find_all('img'):
+        src_attr = img_tag.get('src')
+
+        if src_attr and src_attr.startswith('data:image'):
+            try:
+                string_image = src_attr.split('base64,')[-1]
+                base64_image = bytes(string_image, encoding='utf-8')
+                encrypted_image = base64.b64decode(base64_image)
+                decrypted_image = aes_decryption(encrypted_image)
+
+                im = Image.open(io.BytesIO(decrypted_image))
+                im.show()
+            except Exception as e:
+                print(e)
+                pass
+
+
+def replace_image_sources(driver, html_content):
+    tree = lxml.fromstring(html_content)
+
+    for picture_tag in tree.findall('.//picture'):
+        for source_tag in picture_tag.findall('.//source'):
+            picture_tag.remove(source_tag)
+
+    for img_tag in tree.findall('.//img'):
+        img_xpath = tree.getroottree().getpath(img_tag)
+
+        string_image = encrypt_encode_image_to_base64(driver, img_xpath)
+
+        if string_image:
+            img_tag.set('src', f'data:image/png;base64,{string_image}')
+        else:
+            img_tag.getparent().remove(img_tag)
+
+    modified_html = lxml.tostring(tree, encoding='utf-8').decode('utf-8')
+
+    return modified_html
+
+
+def cleanHTML(driver, html):
+    clean_html = replace_image_sources(driver, html)
+    # decode_decrypt_image_in_base64(clean_html)
 
     formats = [
         "jpg", "jpeg", "jfif", "pjpeg", "pjp",
@@ -301,8 +422,6 @@ def cleanHTML(html):
     ]
 
-    # remove images
-    clean_html = re.sub(r"<img.*?>", "", html)
     clean_html = re.sub(r"<picture.*?>", "", clean_html)
     clean_html = re.sub(r"<svg.*?>", "", clean_html)
     for fmat in formats:
         clean_html = re.sub(r"<object.*" + fmat + ".*?>", "", clean_html)


MarketPlaces/ViceCity/crawler_selenium.py (+5 -5)

@@ -178,8 +178,8 @@ def login(driver):
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -251,7 +251,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             has_next_page = True
             while has_next_page:
@@ -264,7 +264,7 @@ def crawlForum(driver):
                 except:
                     driver.refresh()
                     time.sleep(2.5)  # to let page catchup
-                savePage(driver.page_source, item)
+                savePage(driver, driver.page_source, item)
                 time.sleep(2.5)  # so site doesnt crash
                 driver.back()
@@ -286,7 +286,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
             count += 1
 
     except NoSuchElementException:


setup.ini (+10 -7)

@@ -1,15 +1,18 @@
 [TOR]
-firefox_binary_path = C:\Users\John Wick\Desktop\Tor Browser\Browser\firefox.exe
-firefox_profile_path = C:\Users\John Wick\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default
-geckodriver_path = C:\Users\John Wick\PycharmProjects\dw_pipeline_test\selenium\geckodriver.exe
+firefox_binary_path = C:\Users\calsyslab\Desktop\Tor Browser\Browser\firefox.exe
+firefox_profile_path = C:\Users\calsyslab\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default
+geckodriver_path = C:\calsyslab\Project\dw_pipeline_test\selenium\geckodriver.exe
 
 [Project]
-project_directory = C:\Users\John Wick\PycharmProjects\dw_pipeline_test
-shared_folder = Z:\VBoxSvr\VM_Files_ (shared)
+project_directory = C:\calsyslab\Project\dw_pipeline_test
+shared_folder = \\VBoxSvr\\Shared
 
 [PostgreSQL]
 ip = localhost
 username = postgres
-password = postgres
-database = darkweb_markets_forums
+password = password
+database = darkweb_markets_forums
+
+[Encryption]
+secret = "password"
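One wiring detail: configparser does not strip quotation marks, so the secret read from this file is the ten characters "password" including the surrounding quotes, and the PBKDF2 derivation consumes those quote characters as part of the password. A minimal sketch of the read (the real config object is built in the mining scripts):

from configparser import ConfigParser

config = ConfigParser()
config.read('setup.ini')

secret = config.get('Encryption', 'secret')
print(repr(secret))  # '"password"' -- the quotes are part of the value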
