From d1943e55866c7439790e52454590e041f060f44c Mon Sep 17 00:00:00 2001
From: westernmeadow
Date: Tue, 22 Aug 2023 16:35:10 -0700
Subject: [PATCH] Add image AES encryption, Base64 encoding, and HTML
 embedding

---
 Forums/AbyssForum/crawler_selenium.py       |  12 +-
 Forums/Altenens/crawler_selenium.py         |  28 ++--
 Forums/BestCardingWorld/crawler_selenium.py | 153 +++++++++++---------
 Forums/Cardingleaks/crawler_selenium.py     |   6 +-
 Forums/CryptBB/crawler_selenium.py          |   8 +-
 Forums/HiddenAnswers/crawler_selenium.py    |   8 +-
 Forums/Libre/crawler_selenium.py            |   8 +-
 Forums/OnniForums/crawler_selenium.py       |   8 +-
 Forums/Procrax/crawler_selenium.py          |   8 +-
 Forums/Utilities/utilities.py               | 127 +++++++++++++++-
 10 files changed, 252 insertions(+), 114 deletions(-)

diff --git a/Forums/AbyssForum/crawler_selenium.py b/Forums/AbyssForum/crawler_selenium.py
index 4d2ad99..129e6dc 100644
--- a/Forums/AbyssForum/crawler_selenium.py
+++ b/Forums/AbyssForum/crawler_selenium.py
@@ -135,8 +135,8 @@ def getAccess():
 
 
 # Saves the crawled html page
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -206,7 +206,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             topics = topicPages(html)
             for topic in topics:
@@ -220,7 +220,7 @@ def crawlForum(driver):
                         driver.get(itemURL)
                     except:
                         driver.refresh()
-                    savePage(driver.page_source, topic + f"page{counter}")
+                    savePage(driver, driver.page_source, topic + f"page{counter}")
 
                     # comment out
                     if counter == 2:
@@ -228,8 +228,8 @@ def crawlForum(driver):
 
                     try:
                         temp = driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[2]/div[3]')
-                        item = temp.find_element(by=By.CLASS_NAME, value='button button-icon-only').get_attribute('href')
-                        if item == "":
+                        page = temp.find_element(by=By.CLASS_NAME, value='button button-icon-only').get_attribute('href')
+                        if page == "":
                             raise NoSuchElementException
                         counter += 1
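
The two-line savePage()/cleanHTML() change above is the mechanical half of this patch, and the same edit repeats in every crawler below: savePage() now receives the live Selenium driver and forwards it to cleanHTML(), because the new replace_image_sources() utility in Forums/Utilities/utilities.py (at the end of this patch) must screenshot each <img> element through the WebDriver before the tag is rewritten into an embedded data: URI or stripped.
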
+ f"page{counter}") # very important + savePage(driver, driver.page_source, topic + f"page{counter}") # very important # comment out if counter == 2: diff --git a/Forums/BestCardingWorld/crawler_selenium.py b/Forums/BestCardingWorld/crawler_selenium.py index 7e35381..96821cd 100644 --- a/Forums/BestCardingWorld/crawler_selenium.py +++ b/Forums/BestCardingWorld/crawler_selenium.py @@ -29,14 +29,14 @@ baseURL = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion def startCrawling(): # opentor() forumName = getForumName() - # driver = getAccess() + driver = getAccess() - # if driver != 'down': - # try: - # crawlForum(driver) - # except Exception as e: - # print(driver.current_url, e) - # closetor(driver) + if driver != 'down': + try: + crawlForum(driver) + except Exception as e: + print(driver.current_url, e) + closetor(driver) new_parse(forumName, baseURL, True) @@ -44,10 +44,11 @@ def startCrawling(): # Opens Tor Browser #prompts for ENTER input to continue def opentor(): + from Forums.Initialization.forums_mining import config + global pid print("Connecting Tor...") - path = open('../../path.txt').readline().strip() - pro = subprocess.Popen(path) + pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) pid = pro.pid time.sleep(7.5) input('Tor Connected. Press ENTER to continue\n') @@ -71,9 +72,9 @@ def getFixedURL(): # Closes Tor Browser #@param: current selenium driver def closetor(driver): - global pid + # global pid # os.system("taskkill /pid " + str(pro.pid)) - os.system("taskkill /t /f /im tor.exe") + # os.system("taskkill /t /f /im tor.exe") print('Closing Tor...') driver.close() time.sleep(3) @@ -83,12 +84,11 @@ def closetor(driver): # Creates FireFox 'driver' and configure its 'Profile' # to use Tor proxy and socket def createFFDriver(): - file = open('../../path.txt', 'r') - lines = file.readlines() + from Forums.Initialization.forums_mining import config - ff_binary = FirefoxBinary(lines[0].strip()) + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) - ff_prof = FirefoxProfile(lines[1].strip()) + ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) ff_prof.set_preference("places.history.enabled", False) ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) @@ -110,7 +110,7 @@ def createFFDriver(): ff_prof.set_preference("javascript.enabled", True) ff_prof.update_preferences() - service = Service(lines[2].strip()) + service = Service(config.get('TOR', 'geckodriver_path')) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) @@ -131,8 +131,8 @@ def getAccess(): # Saves the crawled html page, makes the directory path for html pages if not made -def savePage(page, url): - cleanPage = cleanHTML(page) +def savePage(driver, page, url): + cleanPage = cleanHTML(driver, page) filePath = getFullPathName(url) os.makedirs(os.path.dirname(filePath), exist_ok=True) open(filePath, 'wb').write(cleanPage.encode('utf-8')) @@ -142,15 +142,14 @@ def savePage(page, url): # Gets the full path of the page to be saved along with its appropriate file name #@param: raw url as crawler crawls through every site def getFullPathName(url): + from Forums.Initialization.forums_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages") fileName = getNameFromURL(url) if isDescriptionLink(url): - fullPath = 
-        fullPath = r'C:\Users\fakeguy\Documents\threatIntelligence-main\DarkWebMining_Working\Forums\BestCardingWorld\HTML_Pages\\' + str(
-            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
-            "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
+        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
     else:
-        fullPath = r'C:\Users\fakeguy\Documents\threatIntelligence-main\DarkWebMining_Working\Forums\BestCardingWorld\HTML_Pages\\' + str(
-            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
-            "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
+        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
     return fullPath
@@ -171,30 +170,26 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []
 
-    # Penetration Tests
-    links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=43')
+    # # Penetration Tests
+    # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=43')
     # # Social Engineering Tests
-    links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=44')
+    # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=44')
     # # Exploits
-    links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=45')
+    # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=45')
     # # Tools
-    links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=46')
-    # # Malware
+    # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=46')
+    # Malware
     links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=47')
     # # Cryptography
-    links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=48')
+    # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=48')
     # # Others
-    links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=49')
+    # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=49')
     # # Hacking Tutorials
-    links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=50')
+    # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=50')
     # # Hacked Accounts and Database Dumps
     # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=30')
     # # Android Moded pak
-    links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=53')
-
-
-    #General Discussion
-    # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=16&sid=6a4959d49be41e72944e5aa5684c187a')
+    # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=53')
 
     return links
@@ -206,45 +201,70 @@ def crawlForum(driver):
     print("Crawling the BestCardingWorld forum")
 
     linksToCrawl = getInterestedLinks()
-    visited = set(linksToCrawl)
-    initialTime = time.time()
 
     i = 0
     while i < len(linksToCrawl):
         link = linksToCrawl[i]
         print('Crawling :', link)
         try:
-            try:
-                driver.get(link)
-            except:
-                driver.refresh()
-            html = driver.page_source
-            savePage(html, link)
-
             has_next_page = True
+            count = 0
+
             while has_next_page:
-                list = topicPages(html)
-                for item in list:
-                    itemURL = urlparse.urljoin(baseURL, str(item))
-                    try:
-                        driver.get(itemURL)
-                    except:
-                        driver.refresh()
-                    savePage(driver.page_source, item)
-                    driver.back()
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(driver, html, link)
+
+                topics = topicPages(html)
+                for topic in topics:
+                    has_next_topic_page = True
+                    counter = 1
+                    page = topic
+
+                    while has_next_topic_page:
+                        itemURL = urlparse.urljoin(baseURL, str(page))
+                        try:
+                            driver.get(itemURL)
+                        except:
+                            driver.refresh()
+                        savePage(driver, driver.page_source, topic + f"page{counter}")
+
+                        # comment out
+                        if counter == 2:
+                            break
+
+                        try:
+                            nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div[2]/div[4]/ul')
+                            li = nav.find_element(by=By.CLASS_NAME, value='next')
+                            page = li.find_element(by=By.TAG_NAME, value='a').get_attribute('href')
+                            if page == "":
+                                raise NoSuchElementException
+                            counter += 1
+
+                        except NoSuchElementException:
+                            has_next_topic_page = False
+
+                    # end of loop
+                    for j in range(counter):
+                        driver.back()
+
+                    # comment out
+                    break
+
+                # comment out
+                if count == 1:
+                    break
 
                 try:
-                    bar = driver.find_element(by=By.XPATH, value=
-                                              '/html/body/div[1]/div[2]/div[2]/div[3]/ul')
+                    bar = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div[2]/div[3]/ul')
                     next = bar.find_element_by_class_name('next')
                     link = next.find_element_by_tag_name('a').get_attribute('href')
-
-                    try:
-                        driver.get(link)
-                    except:
-                        driver.refresh()
-                    html = driver.page_source
-                    savePage(html, link)
+                    if link == "":
+                        raise NoSuchElementException
+                    count += 1
 
                 except NoSuchElementException:
                     has_next_page = False
@@ -253,9 +273,6 @@ def crawlForum(driver):
             print(link, e)
         i += 1
 
-    # finalTime = time.time()
-    # print finalTime - initialTime
-
     input("Crawling BestCardingWorld forum done successfully. Press ENTER to continue\n")
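
The config.get() calls above replace both the old line-by-line ../../path.txt reads and the hard-coded C:\Users\... output paths with settings shared by all crawlers. The section and key names below are taken from this patch; the file itself is whatever Forums/Initialization/forums_mining.py loads, assumed here to be an INI-style file readable with configparser, and setup.ini is a placeholder name:

    import configparser

    config = configparser.ConfigParser()
    config.read('setup.ini')  # placeholder path; forums_mining defines the real one

    # [TOR] section: the three values formerly read line-by-line from path.txt
    firefox_binary = config.get('TOR', 'firefox_binary_path')
    firefox_profile = config.get('TOR', 'firefox_profile_path')
    geckodriver = config.get('TOR', 'geckodriver_path')

    # [Project] section: root that getFullPathName() now joins with
    # "Forums/<forum>/HTML_Pages/<CURRENT_DATE>/Description|Listing"
    shared_folder = config.get('Project', 'shared_folder')
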
diff --git a/Forums/Cardingleaks/crawler_selenium.py b/Forums/Cardingleaks/crawler_selenium.py
index de8dd0b..85538fd 100644
--- a/Forums/Cardingleaks/crawler_selenium.py
+++ b/Forums/Cardingleaks/crawler_selenium.py
@@ -160,7 +160,7 @@ def getAccess():
 
 # Saves the crawled html page
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -228,7 +228,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             topics = topicPages(html)
             for topic in topics:
diff --git a/Forums/CryptBB/crawler_selenium.py b/Forums/CryptBB/crawler_selenium.py
index bdc964c..5e98a7d 100644
--- a/Forums/CryptBB/crawler_selenium.py
+++ b/Forums/CryptBB/crawler_selenium.py
@@ -177,8 +177,8 @@ def getAccess():
 
 
 # Saves the crawled html page
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -254,7 +254,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             topics = topicPages(html)
             for topic in topics:
@@ -268,7 +268,7 @@ def crawlForum(driver):
                         driver.get(itemURL)
                     except:
                         driver.refresh()
-                    savePage(driver.page_source, topic + f"page{counter}") # very important
+                    savePage(driver, driver.page_source, topic + f"page{counter}") # very important
 
                     # comment out
                     if counter == 2:
diff --git a/Forums/HiddenAnswers/crawler_selenium.py b/Forums/HiddenAnswers/crawler_selenium.py
index 46e445e..6641b81 100644
--- a/Forums/HiddenAnswers/crawler_selenium.py
+++ b/Forums/HiddenAnswers/crawler_selenium.py
@@ -135,8 +135,8 @@ def getAccess():
 
 
 # Saves the crawled html page
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -202,7 +202,7 @@ def crawlForum(driver: webdriver.Firefox):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             topics = topicPages(html)
             for topic in topics:
@@ -216,7 +216,7 @@ def crawlForum(driver: webdriver.Firefox):
                         driver.get(itemURL)
                     except:
                         driver.refresh()
-                    savePage(driver.page_source, topic + f"page{counter}") # very important
+                    savePage(driver, driver.page_source, topic + f"page{counter}") # very important
 
                     # comment out
                     if counter == 2:
diff --git a/Forums/Libre/crawler_selenium.py b/Forums/Libre/crawler_selenium.py
index 4697cda..d06cd83 100644
--- a/Forums/Libre/crawler_selenium.py
+++ b/Forums/Libre/crawler_selenium.py
@@ -159,8 +159,8 @@ def getAccess():
 
 
 # Saves the crawled html page
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -222,7 +222,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             topics = topicPages(html)
             for topic in topics:
@@ -236,7 +236,7 @@ def crawlForum(driver):
                         driver.get(itemURL)
                     except:
                         driver.refresh()
-                    savePage(driver.page_source, topic + f"page{counter}") # very important
+                    savePage(driver, driver.page_source, topic + f"page{counter}") # very important
 
                     # comment out
                     if counter == 2:
diff --git a/Forums/OnniForums/crawler_selenium.py b/Forums/OnniForums/crawler_selenium.py
index a7d0c15..58b1313 100644
--- a/Forums/OnniForums/crawler_selenium.py
+++ b/Forums/OnniForums/crawler_selenium.py
@@ -155,8 +155,8 @@ def getAccess():
 
 
 # Saves the crawled html page
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -232,7 +232,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             topics = topicPages(html)
             for topic in topics:
@@ -246,7 +246,7 @@ def crawlForum(driver):
                         driver.get(itemURL)
                     except:
                         driver.refresh()
-                    savePage(driver.page_source, topic + f"page{counter}") # very important
+                    savePage(driver, driver.page_source, topic + f"page{counter}") # very important
 
                     # comment out
                     if counter == 2:
diff --git a/Forums/Procrax/crawler_selenium.py b/Forums/Procrax/crawler_selenium.py
index fc54a30..f2ed372 100644
--- a/Forums/Procrax/crawler_selenium.py
+++ b/Forums/Procrax/crawler_selenium.py
@@ -153,8 +153,8 @@ def getAccess():
 
 
 # Saves the crawled html page
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -223,7 +223,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             topics = topicPages(html)
             for topic in topics:
@@ -237,7 +237,7 @@ def crawlForum(driver):
                         driver.get(itemURL)
                     except:
                         driver.refresh()
-                    savePage(driver.page_source, topic + f"page{counter}") # very important
+                    savePage(driver, driver.page_source, topic + f"page{counter}") # very important
 
                     # comment out
                     if counter == 2:
diff --git a/Forums/Utilities/utilities.py b/Forums/Utilities/utilities.py
index c451758..fdd5495 100644
--- a/Forums/Utilities/utilities.py
+++ b/Forums/Utilities/utilities.py
@@ -3,8 +3,42 @@ __author__ = 'DarkWeb'
 import string
 import time
 import re
+import hashlib
+import imghdr
+import base64
+import requests
+import io
+import urllib.parse as urlparse
 from datetime import datetime, timedelta
 import datetime as fulldatetime
+from bs4 import BeautifulSoup
+from lxml import html as lxml
+from selenium.webdriver.common.by import By
+from Crypto.Cipher import AES
+from Crypto.Util.Padding import pad, unpad
+from PIL import Image
+
+
+def generate_aes_key():
+    from Forums.Initialization.forums_mining import config
+
+    password = "password"
+    password_bytes = bytes(password, encoding="utf-8")
+
+    # Derive a key from the seed using PBKDF2
+    key = hashlib.pbkdf2_hmac(hash_name='sha256', password=password_bytes, salt=bytes(), iterations=1)
+
+    # Use the first 16 bytes of the derived key as the AES key
+    aes_key = key[:16]
+
+    # print("key: ", aes_key)
+    return aes_key
+
+
+BLOCK_SIZE = 32
+aes_key = generate_aes_key()
+encryptCipher = AES.new(aes_key, AES.MODE_ECB)
+decryptCipher = AES.new(aes_key, AES.MODE_ECB)
 
 
 def cleanText(originalText):
@@ -269,7 +303,96 @@ def convertFromLongDate(longDate, crawlerdate):
     return correct_date
 
 
-def cleanHTML(html):
+def aes_encryption(item):
+
+    to_bytes = bytes(item)
+
+    encrypted_bytes = encryptCipher.encrypt(pad(to_bytes, BLOCK_SIZE))
+
+    return encrypted_bytes
+
+
+def aes_decryption(item):
+
+    to_bytes = bytes(item)
+
+    decrypted_bytes = decryptCipher.decrypt(to_bytes)
+
+    return unpad(decrypted_bytes, BLOCK_SIZE)
+
+
+def encrypt_encode_image_to_base64(driver, xpath):
+
+    try:
+
+        img_element = driver.find_element(by=By.XPATH, value=xpath)
+        image_data = img_element.screenshot_as_png
+
+        encrypted_image = aes_encryption(image_data)
+        base64_image = base64.b64encode(encrypted_image)
+        string_image = base64_image.decode('utf-8')
+
+        return string_image
+
+    except:
+        pass
+
+    return None
+
+
+def decode_decrypt_image_in_base64(html_content):
+
+    soup = BeautifulSoup(html_content, 'html.parser')
+
+    for img_tag in soup.find_all('img'):
+
+        src_attr = img_tag.get('src')
+
+        if src_attr and src_attr.startswith('data:image'):
+
+            try:
+
+                string_image = src_attr.split('base64,')[-1]
+                base64_image = bytes(string_image, encoding='utf-8')
+                encrypted_image = base64.b64decode(base64_image)
+                decrypted_image = aes_decryption(encrypted_image)
+
+                im = Image.open(io.BytesIO(decrypted_image))
+                im.show()
+
+            except Exception as e:
+                print(e)
+                pass
+
+
+def replace_image_sources(driver, html_content):
+
+    tree = lxml.fromstring(html_content)
+
+    for picture_tag in tree.findall('.//picture'):
+        for source_tag in picture_tag.findall('.//source'):
+            picture_tag.remove(source_tag)
+
+    for img_tag in tree.findall('.//img'):
+
+        img_xpath = tree.getroottree().getpath(img_tag)
+
+        string_image = encrypt_encode_image_to_base64(driver, img_xpath)
+
+        if string_image:
+            img_tag.set('src', f'data:image/png;base64,{string_image}')
+        else:
+            img_tag.getparent().remove(img_tag)
+
+    modified_html = lxml.tostring(tree, encoding='utf-8').decode('utf-8')
+
+    return modified_html
+
+
+def cleanHTML(driver, html):
+
+    clean_html = replace_image_sources(driver, html)
+    # decode_decrypt_image_in_base64(clean_html)
 
     formats = [
         "jpg", "jpeg", "jfif", "pjpeg", "pjp",
@@ -278,8 +401,6 @@
     ]
 
     # remove images
-    clean_html = re.sub(r"", "", html)
-    clean_html = re.sub(r"", "", clean_html)
     clean_html = re.sub(r"", "", clean_html)
     for fmat in formats:
         clean_html = re.sub(r"", "", clean_html)
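
Taken together, the new utilities AES-encrypt each screenshotted image, Base64-encode the ciphertext, and inline it as the <img> tag's data: URI; decode_decrypt_image_in_base64() reverses the process for inspection. A minimal, self-contained round trip of that scheme (same key derivation, ECB mode, and padding block size as above; pycryptodome and Pillow as imported by the patch; sample.png is a placeholder input):

    import base64
    import hashlib
    import io

    from Crypto.Cipher import AES
    from Crypto.Util.Padding import pad, unpad
    from PIL import Image

    BLOCK_SIZE = 32
    # generate_aes_key(): PBKDF2-HMAC-SHA256, empty salt, one iteration, first 16 bytes
    aes_key = hashlib.pbkdf2_hmac('sha256', b'password', b'', 1)[:16]

    with open('sample.png', 'rb') as f:  # placeholder image file
        raw = f.read()

    # forward path: what replace_image_sources() stores in each <img> src
    encrypted = AES.new(aes_key, AES.MODE_ECB).encrypt(pad(raw, BLOCK_SIZE))
    src = 'data:image/png;base64,' + base64.b64encode(encrypted).decode('utf-8')

    # reverse path: what decode_decrypt_image_in_base64() does per tag
    blob = base64.b64decode(src.split('base64,')[-1])
    restored = unpad(AES.new(aes_key, AES.MODE_ECB).decrypt(blob), BLOCK_SIZE)

    assert restored == raw
    Image.open(io.BytesIO(restored)).verify()  # still a valid PNG

Because the password, salt, and iteration count are fixed in generate_aes_key(), every checkout of the code derives the identical key, so the embedding is reversible obfuscation of the stored pages rather than per-deployment secrecy.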