diff --git a/MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-311.pyc b/MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-311.pyc
index a6674c4..f88c80b 100644
Binary files a/MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-311.pyc and b/MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-311.pyc differ
diff --git a/MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-311.pyc b/MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-311.pyc
index ef25fdf..680753a 100644
Binary files a/MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-311.pyc and b/MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-311.pyc differ
diff --git a/MarketPlaces/AnonymousMarketplace/parser.py b/MarketPlaces/AnonymousMarketplace/parser.py
index a0880b7..f513508 100644
--- a/MarketPlaces/AnonymousMarketplace/parser.py
+++ b/MarketPlaces/AnonymousMarketplace/parser.py
@@ -43,14 +43,14 @@ def anonymousMarketplace_description_parser(soup: Tag):
 
     product_ratings: Tag = soup.find("div", {"class": "star-rating"})
 
-    product_reviews = product_ratings.find("strong", {"class": "rating"}).text
+    product_reviews = product_ratings.find("div", {"class": "woocommerce-product-rating"}).find("strong", {"class": "rating"}).text
     reviews = cleanString(product_reviews.strip())
 
     product_star_rating = product_ratings.find("span", {"class": "rating"}).text
     rating_item = cleanString(product_star_rating.strip())
 
-    product_price = soup.find("span", {"class": "woocommerce-Price-amount amount"}).text.replace("$", "")
-    USD = cleanString(product_price.strip())
+    product_price = soup.find("span", {"class": "woocommerce-Price-amount amount"}).text
+    USD = cleanString(product_price.replace("$", "").strip())
 
 
     # Populating the final variable (this should be a list with all fields scraped)
@@ -88,26 +88,29 @@ def anonymousMarketplace_listing_parser(soup: Tag):
     href = []  # 20 Product_Links
 
-    product_list: ResultSet[Tag] = soup.find("ul", {"class": "product_list_widget"}).find_all("li")
+    product_list: ResultSet[Tag] = soup.find("ul", {"class": "products columns-4"}).find_all("li")
 
     for item in product_list:
         item_href = item.find("a").get("href")
         href.append(item_href)
 
-        item_name = item.find("span", {"class": "product-title"}).text
-        name.append((item_name.strip()))
+        item_name = item.find("h2", {"class": "woocommerce-loop-product__title"}).text
+        name.append(cleanString(item_name.strip()))
 
         item_rating = item.find("div", {"class": "star-rating"}).find("strong", {"class": "rating"}).text
-        rating_item.append(cleanNumbers(item_rating.strip()))
+        rating_item.append(cleanString(item_rating.strip()))
 
-        item_price = item.find("span", {"class": "woocommerce-Price-amount amount"}).text
-        if not item_price:
+        try:
+            item_price = item.find("span", {"class": "woocommerce-Price-amount amount"}).text
+            item_price = item_price.replace("$", "").strip()
+            USD.append(item_price)
+        except AttributeError:
             USD.append("-1")
-        else:
-            USD.append(cleanNumbers(item_price.replace("$", "").strip()))
+
+
-        vendor.append("-1")
+        vendor.append("Anonymous")
         rating_vendor.append("-1")
         success.append("-1")
         CVE.append("-1")
@@ -126,9 +129,30 @@
 
         nm += 1
 
-    return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
-                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
-
+    return organizeProducts(
+        marketplace=mktName,
+        nm=nm,
+        vendor=vendor,
+        rating_vendor=rating_vendor,
+        success_vendor=success,
+        nombre=name,
+        CVE=CVE,
+        MS=MS,
+        category=category,
+        describe=describe,
+        views=views,
+        reviews=reviews,
+        rating_item=rating_item,
+        addDate=addDate,
+        BTC=BTC,
+        USD=USD,
+        EURO=EURO,
+        sold=sold,
+        qLeft=qLeft,
+        shipFrom=shipFrom,
+        shipTo=shipTo,
+        href=href
+    )
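The listing parser now relies on `except AttributeError` to cover a missing price element: BeautifulSoup's `find()` returns None when the span is absent, so `.text` raises AttributeError and the handler appends "-1". A minimal sketch of the same fallback with an explicit None check, reusing the selectors and the `cleanString` helper already present in this file (the guard itself is an illustrative alternative, not code from the patch):

    price_tag = item.find("span", {"class": "woocommerce-Price-amount amount"})
    if price_tag is None:
        # No price rendered for this product card.
        USD.append("-1")
    else:
        USD.append(cleanString(price_tag.text.replace("$", "").strip()))

Either form works; the explicit check just makes the "missing node" case visible without depending on the exception.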
diff --git a/MarketPlaces/DB_Connection/__pycache__/db_connection.cpython-311.pyc b/MarketPlaces/DB_Connection/__pycache__/db_connection.cpython-311.pyc
index cd3f879..95b0bbf 100644
Binary files a/MarketPlaces/DB_Connection/__pycache__/db_connection.cpython-311.pyc and b/MarketPlaces/DB_Connection/__pycache__/db_connection.cpython-311.pyc differ
diff --git a/MarketPlaces/DB_Connection/db_connection.py b/MarketPlaces/DB_Connection/db_connection.py
index a1b27ff..d75d2e2 100644
--- a/MarketPlaces/DB_Connection/db_connection.py
+++ b/MarketPlaces/DB_Connection/db_connection.py
@@ -139,6 +139,7 @@ def create_vendor(cur, row, marketId):
 
 def create_items(cur, row, marketId, vendorId):
 
+    print(row)
 
     sql = "Insert into items (market_id, vendor_id, name_item, description_item, cve_item, ms_item, category_item, " \
           "views_item, reviews_item, rating_item, dateadded_item, btc_item, usd_item, euro_item, quantitysold_item, " \
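The added `print(row)` dumps every record to stdout on each insert. If the intent is temporary debugging, a small sketch using the standard logging module keeps that output switchable; the helper name `debug_row` is hypothetical and not part of this repository:

    import logging

    logger = logging.getLogger(__name__)

    def debug_row(row, marketId, vendorId):
        # Log the record being inserted at DEBUG level so it can be silenced in production runs.
        logger.debug("items row for market %s / vendor %s: %r", marketId, vendorId, row)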
diff --git a/MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-311.pyc b/MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-311.pyc
index b669316..0e40335 100644
Binary files a/MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-311.pyc and b/MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-311.pyc differ
diff --git a/MarketPlaces/Initialization/geckodriver.log b/MarketPlaces/Initialization/geckodriver.log
index 8b61c4e..dc2853f 100644
--- a/MarketPlaces/Initialization/geckodriver.log
+++ b/MarketPlaces/Initialization/geckodriver.log
@@ -15532,3 +15532,40 @@ DevTools listening on ws://localhost:51081/devtools/browser/ef699bfb-b8a4-403a-a
 1689136181511 RemoteAgent WARN TLS certificate errors will be ignored for this session
 JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
 JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
+1689359222170 geckodriver INFO Listening on 127.0.0.1:50340
+1689359225578 mozrunner::runner INFO Running command: "C:\\Users\\minhkhoitran\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "50341" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\MINHKH~1\\AppData\\Local\\Temp\\rust_mozprofileoAi5wB"
+console.log: "TorSettings: loadFromPrefs()"
+console.log: "TorConnect: init()"
+console.log: "TorConnect: Entering Initial state"
+console.log: "TorConnect: Observed profile-after-change"
+console.log: "TorConnect: Observing topic 'TorProcessExited'"
+console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
+console.log: "TorConnect: Observing topic 'torsettings:ready'"
+console.log: "TorSettings: Observed profile-after-change"
+1689359226071 Marionette INFO Marionette enabled
+console.log: "TorConnect: Will load after bootstrap => [about:blank]"
+console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid"
+JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
+JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
+JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
+JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
+JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
+DevTools listening on ws://localhost:50341/devtools/browser/3ff26efb-c8ef-4d86-bcac-6eb085f5e34c
+1689359227185 Marionette INFO Listening on port 50348
+1689359227621 RemoteAgent WARN TLS certificate errors will be ignored for this session
+JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
+JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
+1689359262127 Marionette INFO Stopped listening on port 50348
+JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
+!!! error running onStopped callback: TypeError: callback is not a function
+JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
+JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\minhkhoitran\AppData\Local\Temp\rust_mozprofileoAi5wB\thumbnails) because it does not exist
+
+###!!! [Child][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
+
+1689359262560 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
+unwatchForTabs()@TargetList.jsm:70
+unwatchForTargets()@TargetList.jsm:37
+destructor()@TargetList.jsm:109
+stop()@CDP.jsm:104
+close()@RemoteAgent.jsm:138
diff --git a/MarketPlaces/Initialization/marketsList.txt b/MarketPlaces/Initialization/marketsList.txt
index 19c6d10..2cf7709 100644
--- a/MarketPlaces/Initialization/marketsList.txt
+++ b/MarketPlaces/Initialization/marketsList.txt
@@ -1 +1 @@
-M00nkeyMarket
+AnonymousMarketplace
\ No newline at end of file
diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py
index 4d5508b..a5336bc 100644
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -149,6 +149,7 @@ def new_parse(marketPlace, url, createLog):
 
         elif marketPlace =="AnonymousMarketplace":
             rmm = anonymousMarketplace_description_parser(soup)
+            print(rmm)
 
         # key = u"Pr:" + rmm[0].upper()[:desc_lim1] + u" Vendor:" + rmm[13].upper()[:desc_lim2]
         key = u"Url:" + os.path.basename(line2).replace(".html", "")
@@ -250,7 +251,8 @@ def new_parse(marketPlace, url, createLog):
 
         try:
             persist_data(url, tuple(rec), cur)
            con.commit()
-        except:
+        except Exception as e:
+            raise e
 
            trace = traceback.format_exc()
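Because `raise e` is the first statement in the new handler, the `traceback.format_exc()` call that follows it never runs, so nothing is written to the parse log before the run aborts. If the goal is to fail fast while still recording the failure, a short sketch reordering the handler (names such as persist_data, con, rec, and traceback come from prepare_parser.py; the print fallback stands in for whatever log writer the surrounding code uses):

    try:
        persist_data(url, tuple(rec), cur)
        con.commit()
    except Exception:
        trace = traceback.format_exc()  # capture the stack before leaving the handler
        print(trace)                    # or append it to the existing parse log
        raise                           # then re-raise so the run still stops on the error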
diff --git a/MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-311.pyc b/MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-311.pyc
index 1ea14d0..a940cdc 100644
Binary files a/MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-311.pyc and b/MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-311.pyc differ
diff --git a/MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-311.pyc b/MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-311.pyc
index 77a5388..af3e012 100644
Binary files a/MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-311.pyc and b/MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-311.pyc differ
diff --git a/MarketPlaces/ThiefWorld/__pycache__/crawler_selenium.cpython-311.pyc b/MarketPlaces/ThiefWorld/__pycache__/crawler_selenium.cpython-311.pyc
index 3dc0317..0ee63ec 100644
Binary files a/MarketPlaces/ThiefWorld/__pycache__/crawler_selenium.cpython-311.pyc and b/MarketPlaces/ThiefWorld/__pycache__/crawler_selenium.cpython-311.pyc differ
diff --git a/MarketPlaces/ThiefWorld/__pycache__/parser.cpython-311.pyc b/MarketPlaces/ThiefWorld/__pycache__/parser.cpython-311.pyc
index 1e8dc5a..da3d193 100644
Binary files a/MarketPlaces/ThiefWorld/__pycache__/parser.cpython-311.pyc and b/MarketPlaces/ThiefWorld/__pycache__/parser.cpython-311.pyc differ
diff --git a/MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-311.pyc b/MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-311.pyc
index 5d4bbfc..ea57780 100644
Binary files a/MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-311.pyc and b/MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-311.pyc differ
diff --git a/MarketPlaces/TorBay/__pycache__/parser.cpython-311.pyc b/MarketPlaces/TorBay/__pycache__/parser.cpython-311.pyc
index 990e55e..b8c67cf 100644
Binary files a/MarketPlaces/TorBay/__pycache__/parser.cpython-311.pyc and b/MarketPlaces/TorBay/__pycache__/parser.cpython-311.pyc differ
diff --git a/setup.ini b/setup.ini
index 641d3f1..3bebcab 100644
--- a/setup.ini
+++ b/setup.ini
@@ -1,11 +1,11 @@
 [TOR]
-firefox_binary_path = C:\\Users\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe
-firefox_profile_path = C:\\Users\\Helium\\Desktop\\Tor Browser\\Browser\\TorBrowser\\Data\\Browser\\profile.default
-geckodriver_path = C:\\Users\\Helium\\PycharmProjects\\dw_pipeline_test\\selenium\\geckodriver.exe
+firefox_binary_path = C:\Users\minhkhoitran\Desktop\Tor Browser\Browser\firefox.exe
+firefox_profile_path = C:\Users\minhkhoitran\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default
+geckodriver_path = C:\nsf-reu\dw_pipeline_test\selenium\geckodriver.exe
 
 [Project]
-project_directory = C:\\Users\\Helium\\PycharmProjects\\dw_pipeline_test
+project_directory = C:\nsf-reu\dw_pipeline_test
 shared_folder = \\VBoxSvr\\Shared
 
 [PostgreSQL]
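The committed setup.ini now carries machine-specific absolute paths (the minhkhoitran Tor Browser profile and the C:\nsf-reu checkout), so each contributor will keep rewriting these lines. A minimal sketch of reading the same keys with Python's standard configparser; the section and option names come from the file above, but this loader is an assumption and not necessarily how the pipeline reads its config:

    import configparser

    config = configparser.ConfigParser()
    config.read("setup.ini")

    # Paths used to launch the Tor Browser build of Firefox under Selenium.
    firefox_binary_path = config["TOR"]["firefox_binary_path"]
    firefox_profile_path = config["TOR"]["firefox_profile_path"]
    geckodriver_path = config["TOR"]["geckodriver_path"]
    project_directory = config["Project"]["project_directory"]

Keeping a per-machine setup.ini out of version control (for example via a checked-in setup.ini.template) would avoid these path churn commits.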