Browse Source

Finished with parser for AnonymousMarketplace

main
Khoi 1 year ago
parent
commit
51f7920a49
16 changed files with 85 additions and 21 deletions
  1. BIN
      MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-311.pyc
  2. BIN
      MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-311.pyc
  3. +39
    -15
      MarketPlaces/AnonymousMarketplace/parser.py
  4. BIN
      MarketPlaces/DB_Connection/__pycache__/db_connection.cpython-311.pyc
  5. +1
    -0
      MarketPlaces/DB_Connection/db_connection.py
  6. BIN
      MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-311.pyc
  7. +37
    -0
      MarketPlaces/Initialization/geckodriver.log
  8. +1
    -1
      MarketPlaces/Initialization/marketsList.txt
  9. +3
    -1
      MarketPlaces/Initialization/prepare_parser.py
  10. BIN
      MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-311.pyc
  11. BIN
      MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-311.pyc
  12. BIN
      MarketPlaces/ThiefWorld/__pycache__/crawler_selenium.cpython-311.pyc
  13. BIN
      MarketPlaces/ThiefWorld/__pycache__/parser.cpython-311.pyc
  14. BIN
      MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-311.pyc
  15. BIN
      MarketPlaces/TorBay/__pycache__/parser.cpython-311.pyc
  16. +4
    -4
      setup.ini

BIN
MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-311.pyc View File


BIN
MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-311.pyc View File


+ 39
- 15
MarketPlaces/AnonymousMarketplace/parser.py View File

@ -43,14 +43,14 @@ def anonymousMarketplace_description_parser(soup: Tag):
product_ratings: Tag = soup.find("div", {"class": "star-rating"})
product_reviews = product_ratings.find("strong", {"class": "rating"}).text
product_reviews = product_ratings.find("div", {"class": "woocommerce-product-rating"}).find("strong", {"class": "rating"}).text
reviews = cleanString(product_reviews.strip())
product_star_rating = product_ratings.find("span", {"class": "rating"}).text
rating_item = cleanString(product_star_rating.strip())
product_price = soup.find("span", {"class": "woocommerce-Price-amount amount"}).text.replace("$", "")
USD = cleanString(product_price.strip())
product_price = soup.find("span", {"class": "woocommerce-Price-amount amount"}).text
USD = cleanString(product_price.replace("$", "").strip())
# Populating the final variable (this should be a list with all fields scraped)
@ -88,26 +88,29 @@ def anonymousMarketplace_listing_parser(soup: Tag):
href = [] # 20 Product_Links
product_list: ResultSet[Tag] = soup.find("ul", {"class": "product_list_widget"}).find_all("li")
product_list: ResultSet[Tag] = soup.find("ul", {"class": "products columns-4"}).find_all("li")
for item in product_list:
item_href = item.find("a").get("href")
href.append(item_href)
item_name = item.find("span", {"class": "product-title"}).text
name.append((item_name.strip()))
item_name = item.find("h2", {"class": "woocommerce-loop-product__title"}).text
name.append(cleanString(item_name.strip()))
item_rating = item.find("div", {"class": "star-rating"}).find("strong", {"class": "rating"}).text
rating_item.append(cleanNumbers(item_rating.strip()))
rating_item.append(cleanString(item_rating.strip()))
item_price = item.find("span", {"class": "woocommerce-Price-amount amount"}).text
if not item_price:
try:
item_price = item.find("span", {"class": "woocommerce-Price-amount amount"}).text
item_price = item_price.replace("$", "").strip()
USD.append(item_price)
except AttributeError:
USD.append("-1")
else:
USD.append(cleanNumbers(item_price.replace("$", "").strip()))
vendor.append("-1")
vendor.append("Anonymous")
rating_vendor.append("-1")
success.append("-1")
CVE.append("-1")
@ -126,9 +129,30 @@ def anonymousMarketplace_listing_parser(soup: Tag):
nm += 1
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
return organizeProducts(
marketplace=mktName,
nm=nm,
vendor=vendor,
rating_vendor=rating_vendor,
success_vendor=success,
nombre=name,
CVE=CVE,
MS=MS,
category=category,
describe=describe,
views=views,
reviews=reviews,
rating_item=rating_item,
addDate=addDate,
BTC=BTC,
USD=USD,
EURO=EURO,
sold=sold,
qLeft=qLeft,
shipFrom=shipFrom,
shipTo=shipTo,
href=href
)


BIN
MarketPlaces/DB_Connection/__pycache__/db_connection.cpython-311.pyc View File


+ 1
- 0
MarketPlaces/DB_Connection/db_connection.py View File

@ -139,6 +139,7 @@ def create_vendor(cur, row, marketId):
def create_items(cur, row, marketId, vendorId):
print(row)
sql = "Insert into items (market_id, vendor_id, name_item, description_item, cve_item, ms_item, category_item, " \
"views_item, reviews_item, rating_item, dateadded_item, btc_item, usd_item, euro_item, quantitysold_item, " \


BIN
MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-311.pyc View File


+ 37
- 0
MarketPlaces/Initialization/geckodriver.log View File

@ -15532,3 +15532,40 @@ DevTools listening on ws://localhost:51081/devtools/browser/ef699bfb-b8a4-403a-a
1689136181511 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
1689359222170 geckodriver INFO Listening on 127.0.0.1:50340
1689359225578 mozrunner::runner INFO Running command: "C:\\Users\\minhkhoitran\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "50341" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\MINHKH~1\\AppData\\Local\\Temp\\rust_mozprofileoAi5wB"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689359226071 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:50341/devtools/browser/3ff26efb-c8ef-4d86-bcac-6eb085f5e34c
1689359227185 Marionette INFO Listening on port 50348
1689359227621 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
1689359262127 Marionette INFO Stopped listening on port 50348
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\minhkhoitran\AppData\Local\Temp\rust_mozprofileoAi5wB\thumbnails) because it does not exist
###!!! [Child][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1689359262560 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138

+ 1
- 1
MarketPlaces/Initialization/marketsList.txt View File

@ -1 +1 @@
M00nkeyMarket
AnonymousMarketplace

+ 3
- 1
MarketPlaces/Initialization/prepare_parser.py View File

@ -149,6 +149,7 @@ def new_parse(marketPlace, url, createLog):
elif marketPlace =="AnonymousMarketplace":
rmm = anonymousMarketplace_description_parser(soup)
print(rmm)
# key = u"Pr:" + rmm[0].upper()[:desc_lim1] + u" Vendor:" + rmm[13].upper()[:desc_lim2]
key = u"Url:" + os.path.basename(line2).replace(".html", "")
@ -250,7 +251,8 @@ def new_parse(marketPlace, url, createLog):
try:
persist_data(url, tuple(rec), cur)
con.commit()
except:
except Exception as e:
raise e
trace = traceback.format_exc()


BIN
MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-311.pyc View File


BIN
MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-311.pyc View File


BIN
MarketPlaces/ThiefWorld/__pycache__/crawler_selenium.cpython-311.pyc View File


BIN
MarketPlaces/ThiefWorld/__pycache__/parser.cpython-311.pyc View File


BIN
MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-311.pyc View File


BIN
MarketPlaces/TorBay/__pycache__/parser.cpython-311.pyc View File


+ 4
- 4
setup.ini View File

@ -1,11 +1,11 @@
[TOR]
firefox_binary_path = C:\\Users\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe
firefox_profile_path = C:\\Users\\Helium\\Desktop\\Tor Browser\\Browser\\TorBrowser\\Data\\Browser\\profile.default
geckodriver_path = C:\\Users\\Helium\\PycharmProjects\\dw_pipeline_test\\selenium\\geckodriver.exe
firefox_binary_path = C:\Users\minhkhoitran\Desktop\Tor Browser\Browser\firefox.exe
firefox_profile_path = C:\Users\minhkhoitran\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default
geckodriver_path = C:\nsf-reu\dw_pipeline_test\selenium\geckodriver.exe
[Project]
project_directory = C:\\Users\\Helium\\PycharmProjects\\dw_pipeline_test
project_directory = C:\nsf-reu\dw_pipeline_test
shared_folder = \\VBoxSvr\\Shared
[PostgreSQL]


Loading…
Cancel
Save