From 9fca859758f2621e6051cb85ed751427a23c6c67 Mon Sep 17 00:00:00 2001 From: ericssonmarin-cpp <85146518+ericssonmarin-cpp@users.noreply.github.com> Date: Sun, 15 Oct 2023 14:00:28 -0700 Subject: [PATCH 01/11] Forums and Markets status. --- .idea/DW_Pipeline_Test.iml | 2 +- .idea/misc.xml | 2 +- Forums/DB_Connection/db_connection.py | 23 +++++++++++++++++++ Forums/Initialization/forumsList.txt | 9 +------- Forums/Initialization/prepare_parser.py | 6 +++++ MarketPlaces/DB_Connection/db_connection.py | 23 +++++++++++++++++++ MarketPlaces/Initialization/marketsList.txt | 2 +- MarketPlaces/Initialization/prepare_parser.py | 6 +++++ setup.ini | 14 ++++++----- 9 files changed, 70 insertions(+), 17 deletions(-) diff --git a/.idea/DW_Pipeline_Test.iml b/.idea/DW_Pipeline_Test.iml index 8489f64..9f9af70 100644 --- a/.idea/DW_Pipeline_Test.iml +++ b/.idea/DW_Pipeline_Test.iml @@ -2,7 +2,7 @@ - + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index 11f1ea0..653c6ff 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/Forums/DB_Connection/db_connection.py b/Forums/DB_Connection/db_connection.py index e4f6c5d..f0d4ed6 100644 --- a/Forums/DB_Connection/db_connection.py +++ b/Forums/DB_Connection/db_connection.py @@ -484,6 +484,24 @@ def create_posts(cur, row, forumId, topicId): 'dateinserted_post': row[8], 'postId': postId}) +def create_status(cur, forumId, date, status): + + date = datetime.strptime(date, "%m%d%Y") + + # checking if status already exists + sql = "select * from forums_status where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s" + cur.execute(sql, {'forum_id': forumId, 'date_inserted': date}) + + recset = cur.fetchall() + if recset: + sql = "Update forums_status set status = %(status)s where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s" + recset = {'status': status, 'forum_id': forumId, 'date_inserted': date} + else: + sql = "Insert into forums_status (forum_id, date_inserted, status) Values (%s, %s, %s)" + recset = [forumId, date, status] + + cur.execute(sql, recset) + def create_database(cur, con): try: @@ -496,6 +514,11 @@ def create_database(cur, con): sql = "create unique index unique_forum ON forums USING btree (name_forum ASC NULLS LAST)" cur.execute(sql) + sql = "Create table forums_status (forum_id integer NOT NULL, date_inserted date NOT NULL, status bit(1) NOT NULL, " \ + "CONSTRAINT forums_log_pkey PRIMARY KEY (forum_id, date_inserted), " \ + "CONSTRAINT forums_fk FOREIGN KEY (forum_id) REFERENCES forums (forum_id))" + cur.execute(sql) + sql = "create table users (user_id integer NOT NULL, forum_id integer NOT NULL, name_user character varying(" \ "255) NOT NULL, status_user character varying(255) null, reputation_user character varying(255) null, " \ "interest_user character varying(5000) null, signature_user character varying(1000) null, " \ diff --git a/Forums/Initialization/forumsList.txt b/Forums/Initialization/forumsList.txt index 9cfeb56..efa9686 100644 --- a/Forums/Initialization/forumsList.txt +++ b/Forums/Initialization/forumsList.txt @@ -1,8 +1 @@ -Altenens -BestCardingWorld -Cardingleaks -CryptBB -HiddenAnswers -Libre -OnniForums -Procrax \ No newline at end of file +BestCardingWorld \ No newline at end of file diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py index 91b662f..1f55319 100644 --- a/Forums/Initialization/prepare_parser.py +++ b/Forums/Initialization/prepare_parser.py @@ -341,6 +341,12 @@ def 
new_parse(forum, url, createLog): # move listing files of completed folder move_file(listingFile, createLog, logFile) + # registering the current forum status (up/down) in the database + forumId = verifyForum(cur, forum) + if (forumId > 0): + create_status(cur, forumId, CURRENT_DATE, '1' if len(listings) > 0 else '0') + con.commit() + if createLog: logFile.close() diff --git a/MarketPlaces/DB_Connection/db_connection.py b/MarketPlaces/DB_Connection/db_connection.py index 8769869..4f439f0 100644 --- a/MarketPlaces/DB_Connection/db_connection.py +++ b/MarketPlaces/DB_Connection/db_connection.py @@ -401,6 +401,24 @@ def create_items(cur, row, marketId, vendorId): return itemId +def create_status(cur, marketId, date, status): + + date = datetime.strptime(date, "%m%d%Y") + + # checking if status already exists + sql = "select * from marketplaces_status where market_id = %(market_id)s and date_inserted = %(date_inserted)s" + cur.execute(sql, {'market_id': marketId, 'date_inserted': date}) + + recset = cur.fetchall() + if recset: + sql = "Update marketplaces_status set status = %(status)s where market_id = %(market_id)s and date_inserted = %(date_inserted)s" + recset = {'status': status, 'market_id': marketId, 'date_inserted': date} + else: + sql = "Insert into marketplaces_status (market_id, date_inserted, status) Values (%s, %s, %s)" + recset = [marketId, date, status] + + cur.execute(sql, recset) + def create_database(cur, con): try: @@ -413,6 +431,11 @@ def create_database(cur, con): sql = "create unique index unique_market ON marketplaces USING btree (name_market ASC NULLS LAST)" cur.execute(sql) + sql = "Create table marketplaces_status (market_id integer NOT NULL, date_inserted date NOT NULL, status bit(1) NOT NULL, " \ + "CONSTRAINT marketplaces_log_pkey PRIMARY KEY (market_id, date_inserted), " \ + "CONSTRAINT marketplaces_fk FOREIGN KEY (market_id) REFERENCES marketplaces (market_id))" + cur.execute(sql) + sql = "create table vendors(vendor_id integer not null, market_id integer not null, name_vendor character " \ "varying(255) not null, rating_vendor character varying(255), successfultransactions_vendor integer " \ "null, image_vendor character varying(10000000) null, dateinserted_vendor timestamp(6) with time zone not null, " \ diff --git a/MarketPlaces/Initialization/marketsList.txt b/MarketPlaces/Initialization/marketsList.txt index 87f811c..8b944c5 100644 --- a/MarketPlaces/Initialization/marketsList.txt +++ b/MarketPlaces/Initialization/marketsList.txt @@ -1 +1 @@ -ThiefWorld \ No newline at end of file +ViceCity \ No newline at end of file diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py index e075541..c56054e 100644 --- a/MarketPlaces/Initialization/prepare_parser.py +++ b/MarketPlaces/Initialization/prepare_parser.py @@ -363,6 +363,12 @@ def new_parse(marketPlace, url, createLog): # move listing files of completed folder move_file(listingFile, createLog, logFile) + # registering the current forum status (up/down) in the database + marketId = verifyMarketPlace(cur, marketPlace) + if (marketId > 0): + create_status(cur, marketId, CURRENT_DATE, '1' if len(listings) > 0 else '0') + con.commit() + if createLog: logFile.close() diff --git a/setup.ini b/setup.ini index 29997a6..883e495 100644 --- a/setup.ini +++ b/setup.ini @@ -1,17 +1,19 @@ [TOR] -firefox_binary_path = C:\Users\calsyslab\Desktop\Tor Browser\Browser\firefox.exe -firefox_profile_path = C:\Users\calsyslab\Desktop\Tor 
Browser\Browser\TorBrowser\Data\Browser\profile.default -geckodriver_path = C:\calsyslab\Project\dw_pipeline_test\selenium\geckodriver.exe +firefox_binary_path = C:\Users\santanamarin\OneDrive - Cal Poly Pomona\Desktop\Tor Browser\Browser\firefox.exe +firefox_profile_path = C:\Users\santanamarin\OneDrive - Cal Poly Pomona\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default +geckodriver_path = E:\Faculty\CSPUP\ResearchProjects\dw_pipeline_test\selenium\geckodriver.exe + +"C:\Users\santanamarin\OneDrive - Cal Poly Pomona\Desktop\Tor Browser\Browser\firefox.exe" [Project] -project_directory = C:\calsyslab\Project\dw_pipeline_test -shared_folder = \\VBoxSvr\Shared +project_directory = E:\Faculty\CSPUP\ResearchProjects\dw_pipeline_test +shared_folder = E:\Faculty\CSPUP\ResearchProjects\dw_pipeline_test [PostgreSQL] ip = localhost username = postgres -password = password +password = 123 database = darkweb_markets_forums [Encryption] From 44561c509fb7ed1a1923ab5b25a328117d2228d1 Mon Sep 17 00:00:00 2001 From: Ericsson Santana Marin Date: Sun, 15 Oct 2023 22:12:17 +0000 Subject: [PATCH 02/11] Revert "Forums and Markets status." This reverts commit 9fca859758f2621e6051cb85ed751427a23c6c67 --- .idea/DW_Pipeline_Test.iml | 2 +- .idea/misc.xml | 2 +- Forums/DB_Connection/db_connection.py | 23 ------------------- Forums/Initialization/forumsList.txt | 9 +++++++- Forums/Initialization/prepare_parser.py | 6 ----- MarketPlaces/DB_Connection/db_connection.py | 23 ------------------- MarketPlaces/Initialization/marketsList.txt | 2 +- MarketPlaces/Initialization/prepare_parser.py | 6 ----- setup.ini | 14 +++++------ 9 files changed, 17 insertions(+), 70 deletions(-) diff --git a/.idea/DW_Pipeline_Test.iml b/.idea/DW_Pipeline_Test.iml index 9f9af70..8489f64 100644 --- a/.idea/DW_Pipeline_Test.iml +++ b/.idea/DW_Pipeline_Test.iml @@ -2,7 +2,7 @@ - + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index 653c6ff..11f1ea0 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/Forums/DB_Connection/db_connection.py b/Forums/DB_Connection/db_connection.py index f0d4ed6..e4f6c5d 100644 --- a/Forums/DB_Connection/db_connection.py +++ b/Forums/DB_Connection/db_connection.py @@ -484,24 +484,6 @@ def create_posts(cur, row, forumId, topicId): 'dateinserted_post': row[8], 'postId': postId}) -def create_status(cur, forumId, date, status): - - date = datetime.strptime(date, "%m%d%Y") - - # checking if status already exists - sql = "select * from forums_status where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s" - cur.execute(sql, {'forum_id': forumId, 'date_inserted': date}) - - recset = cur.fetchall() - if recset: - sql = "Update forums_status set status = %(status)s where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s" - recset = {'status': status, 'forum_id': forumId, 'date_inserted': date} - else: - sql = "Insert into forums_status (forum_id, date_inserted, status) Values (%s, %s, %s)" - recset = [forumId, date, status] - - cur.execute(sql, recset) - def create_database(cur, con): try: @@ -514,11 +496,6 @@ def create_database(cur, con): sql = "create unique index unique_forum ON forums USING btree (name_forum ASC NULLS LAST)" cur.execute(sql) - sql = "Create table forums_status (forum_id integer NOT NULL, date_inserted date NOT NULL, status bit(1) NOT NULL, " \ - "CONSTRAINT forums_log_pkey PRIMARY KEY (forum_id, date_inserted), " \ - "CONSTRAINT forums_fk FOREIGN KEY (forum_id) REFERENCES forums 
(forum_id))" - cur.execute(sql) - sql = "create table users (user_id integer NOT NULL, forum_id integer NOT NULL, name_user character varying(" \ "255) NOT NULL, status_user character varying(255) null, reputation_user character varying(255) null, " \ "interest_user character varying(5000) null, signature_user character varying(1000) null, " \ diff --git a/Forums/Initialization/forumsList.txt b/Forums/Initialization/forumsList.txt index efa9686..9cfeb56 100644 --- a/Forums/Initialization/forumsList.txt +++ b/Forums/Initialization/forumsList.txt @@ -1 +1,8 @@ -BestCardingWorld \ No newline at end of file +Altenens +BestCardingWorld +Cardingleaks +CryptBB +HiddenAnswers +Libre +OnniForums +Procrax \ No newline at end of file diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py index 1f55319..91b662f 100644 --- a/Forums/Initialization/prepare_parser.py +++ b/Forums/Initialization/prepare_parser.py @@ -341,12 +341,6 @@ def new_parse(forum, url, createLog): # move listing files of completed folder move_file(listingFile, createLog, logFile) - # registering the current forum status (up/down) in the database - forumId = verifyForum(cur, forum) - if (forumId > 0): - create_status(cur, forumId, CURRENT_DATE, '1' if len(listings) > 0 else '0') - con.commit() - if createLog: logFile.close() diff --git a/MarketPlaces/DB_Connection/db_connection.py b/MarketPlaces/DB_Connection/db_connection.py index 4f439f0..8769869 100644 --- a/MarketPlaces/DB_Connection/db_connection.py +++ b/MarketPlaces/DB_Connection/db_connection.py @@ -401,24 +401,6 @@ def create_items(cur, row, marketId, vendorId): return itemId -def create_status(cur, marketId, date, status): - - date = datetime.strptime(date, "%m%d%Y") - - # checking if status already exists - sql = "select * from marketplaces_status where market_id = %(market_id)s and date_inserted = %(date_inserted)s" - cur.execute(sql, {'market_id': marketId, 'date_inserted': date}) - - recset = cur.fetchall() - if recset: - sql = "Update marketplaces_status set status = %(status)s where market_id = %(market_id)s and date_inserted = %(date_inserted)s" - recset = {'status': status, 'market_id': marketId, 'date_inserted': date} - else: - sql = "Insert into marketplaces_status (market_id, date_inserted, status) Values (%s, %s, %s)" - recset = [marketId, date, status] - - cur.execute(sql, recset) - def create_database(cur, con): try: @@ -431,11 +413,6 @@ def create_database(cur, con): sql = "create unique index unique_market ON marketplaces USING btree (name_market ASC NULLS LAST)" cur.execute(sql) - sql = "Create table marketplaces_status (market_id integer NOT NULL, date_inserted date NOT NULL, status bit(1) NOT NULL, " \ - "CONSTRAINT marketplaces_log_pkey PRIMARY KEY (market_id, date_inserted), " \ - "CONSTRAINT marketplaces_fk FOREIGN KEY (market_id) REFERENCES marketplaces (market_id))" - cur.execute(sql) - sql = "create table vendors(vendor_id integer not null, market_id integer not null, name_vendor character " \ "varying(255) not null, rating_vendor character varying(255), successfultransactions_vendor integer " \ "null, image_vendor character varying(10000000) null, dateinserted_vendor timestamp(6) with time zone not null, " \ diff --git a/MarketPlaces/Initialization/marketsList.txt b/MarketPlaces/Initialization/marketsList.txt index 8b944c5..87f811c 100644 --- a/MarketPlaces/Initialization/marketsList.txt +++ b/MarketPlaces/Initialization/marketsList.txt @@ -1 +1 @@ -ViceCity \ No newline at end of file +ThiefWorld \ No 
newline at end of file diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py index c56054e..e075541 100644 --- a/MarketPlaces/Initialization/prepare_parser.py +++ b/MarketPlaces/Initialization/prepare_parser.py @@ -363,12 +363,6 @@ def new_parse(marketPlace, url, createLog): # move listing files of completed folder move_file(listingFile, createLog, logFile) - # registering the current forum status (up/down) in the database - marketId = verifyMarketPlace(cur, marketPlace) - if (marketId > 0): - create_status(cur, marketId, CURRENT_DATE, '1' if len(listings) > 0 else '0') - con.commit() - if createLog: logFile.close() diff --git a/setup.ini b/setup.ini index 883e495..29997a6 100644 --- a/setup.ini +++ b/setup.ini @@ -1,19 +1,17 @@ [TOR] -firefox_binary_path = C:\Users\santanamarin\OneDrive - Cal Poly Pomona\Desktop\Tor Browser\Browser\firefox.exe -firefox_profile_path = C:\Users\santanamarin\OneDrive - Cal Poly Pomona\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default -geckodriver_path = E:\Faculty\CSPUP\ResearchProjects\dw_pipeline_test\selenium\geckodriver.exe - -"C:\Users\santanamarin\OneDrive - Cal Poly Pomona\Desktop\Tor Browser\Browser\firefox.exe" +firefox_binary_path = C:\Users\calsyslab\Desktop\Tor Browser\Browser\firefox.exe +firefox_profile_path = C:\Users\calsyslab\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default +geckodriver_path = C:\calsyslab\Project\dw_pipeline_test\selenium\geckodriver.exe [Project] -project_directory = E:\Faculty\CSPUP\ResearchProjects\dw_pipeline_test -shared_folder = E:\Faculty\CSPUP\ResearchProjects\dw_pipeline_test +project_directory = C:\calsyslab\Project\dw_pipeline_test +shared_folder = \\VBoxSvr\Shared [PostgreSQL] ip = localhost username = postgres -password = 123 +password = password database = darkweb_markets_forums [Encryption] From ed5a9193e19b1ef6fef6dd714920306b20c6dfd6 Mon Sep 17 00:00:00 2001 From: ericssonmarin-cpp <85146518+ericssonmarin-cpp@users.noreply.github.com> Date: Sun, 15 Oct 2023 15:20:02 -0700 Subject: [PATCH 03/11] Forums and Markets status. 
---
 Forums/DB_Connection/db_connection.py         | 23 +++++++++++++++++++
 Forums/Initialization/prepare_parser.py       |  6 +++++
 MarketPlaces/DB_Connection/db_connection.py   | 23 +++++++++++++++++++
 MarketPlaces/Initialization/prepare_parser.py |  6 +++++
 4 files changed, 58 insertions(+)

diff --git a/Forums/DB_Connection/db_connection.py b/Forums/DB_Connection/db_connection.py
index e4f6c5d..f0d4ed6 100644
--- a/Forums/DB_Connection/db_connection.py
+++ b/Forums/DB_Connection/db_connection.py
@@ -484,6 +484,24 @@ def create_posts(cur, row, forumId, topicId):
                         'dateinserted_post': row[8],
                         'postId': postId})

+def create_status(cur, forumId, date, status):
+
+    date = datetime.strptime(date, "%m%d%Y")
+
+    # checking if status already exists
+    sql = "select * from forums_status where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s"
+    cur.execute(sql, {'forum_id': forumId, 'date_inserted': date})
+
+    recset = cur.fetchall()
+    if recset:
+        sql = "Update forums_status set status = %(status)s where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s"
+        recset = {'status': status, 'forum_id': forumId, 'date_inserted': date}
+    else:
+        sql = "Insert into forums_status (forum_id, date_inserted, status) Values (%s, %s, %s)"
+        recset = [forumId, date, status]
+
+    cur.execute(sql, recset)
+
 def create_database(cur, con):

     try:
@@ -496,6 +514,11 @@ def create_database(cur, con):
         sql = "create unique index unique_forum ON forums USING btree (name_forum ASC NULLS LAST)"
         cur.execute(sql)

+        sql = "Create table forums_status (forum_id integer NOT NULL, date_inserted date NOT NULL, status bit(1) NOT NULL, " \
+              "CONSTRAINT forums_log_pkey PRIMARY KEY (forum_id, date_inserted), " \
+              "CONSTRAINT forums_fk FOREIGN KEY (forum_id) REFERENCES forums (forum_id))"
+        cur.execute(sql)
+
         sql = "create table users (user_id integer NOT NULL, forum_id integer NOT NULL, name_user character varying(" \
               "255) NOT NULL, status_user character varying(255) null, reputation_user character varying(255) null, " \
               "interest_user character varying(5000) null, signature_user character varying(1000) null, " \
diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py
index 91b662f..1f55319 100644
--- a/Forums/Initialization/prepare_parser.py
+++ b/Forums/Initialization/prepare_parser.py
@@ -341,6 +341,12 @@ def new_parse(forum, url, createLog):
         # move listing files of completed folder
         move_file(listingFile, createLog, logFile)

+    # registering the current forum status (up/down) in the database
+    forumId = verifyForum(cur, forum)
+    if (forumId > 0):
+        create_status(cur, forumId, CURRENT_DATE, '1' if len(listings) > 0 else '0')
+        con.commit()
+
     if createLog:
         logFile.close()

diff --git a/MarketPlaces/DB_Connection/db_connection.py b/MarketPlaces/DB_Connection/db_connection.py
index 8769869..4f439f0 100644
--- a/MarketPlaces/DB_Connection/db_connection.py
+++ b/MarketPlaces/DB_Connection/db_connection.py
@@ -401,6 +401,24 @@ def create_items(cur, row, marketId, vendorId):

     return itemId

+def create_status(cur, marketId, date, status):
+
+    date = datetime.strptime(date, "%m%d%Y")
+
+    # checking if status already exists
+    sql = "select * from marketplaces_status where market_id = %(market_id)s and date_inserted = %(date_inserted)s"
+    cur.execute(sql, {'market_id': marketId, 'date_inserted': date})
+
+    recset = cur.fetchall()
+    if recset:
+        sql = "Update marketplaces_status set status = %(status)s where market_id = %(market_id)s and date_inserted = %(date_inserted)s"
+        recset = {'status': status, 'market_id': marketId, 'date_inserted': date}
+    else:
+        sql = "Insert into marketplaces_status (market_id, date_inserted, status) Values (%s, %s, %s)"
+        recset = [marketId, date, status]
+
+    cur.execute(sql, recset)
+
 def create_database(cur, con):

     try:
@@ -413,6 +431,11 @@ def create_database(cur, con):
         sql = "create unique index unique_market ON marketplaces USING btree (name_market ASC NULLS LAST)"
         cur.execute(sql)

+        sql = "Create table marketplaces_status (market_id integer NOT NULL, date_inserted date NOT NULL, status bit(1) NOT NULL, " \
+              "CONSTRAINT marketplaces_log_pkey PRIMARY KEY (market_id, date_inserted), " \
+              "CONSTRAINT marketplaces_fk FOREIGN KEY (market_id) REFERENCES marketplaces (market_id))"
+        cur.execute(sql)
+
         sql = "create table vendors(vendor_id integer not null, market_id integer not null, name_vendor character " \
               "varying(255) not null, rating_vendor character varying(255), successfultransactions_vendor integer " \
               "null, image_vendor character varying(10000000) null, dateinserted_vendor timestamp(6) with time zone not null, " \
diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py
index e075541..c56054e 100644
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -363,6 +363,12 @@ def new_parse(marketPlace, url, createLog):
         # move listing files of completed folder
         move_file(listingFile, createLog, logFile)

+    # registering the current forum status (up/down) in the database
+    marketId = verifyMarketPlace(cur, marketPlace)
+    if (marketId > 0):
+        create_status(cur, marketId, CURRENT_DATE, '1' if len(listings) > 0 else '0')
+        con.commit()
+
     if createLog:
         logFile.close()


From 61c85e05473cd81bb5ae181957cf6f899cdd04e9 Mon Sep 17 00:00:00 2001
From: ericssonmarin-cpp <85146518+ericssonmarin-cpp@users.noreply.github.com>
Date: Sun, 15 Oct 2023 15:38:24 -0700
Subject: [PATCH 04/11] Forums and Markets status.

---
 MarketPlaces/Initialization/prepare_parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py
index c56054e..f3c792a 100644
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -363,7 +363,7 @@ def new_parse(marketPlace, url, createLog):
         # move listing files of completed folder
         move_file(listingFile, createLog, logFile)

-    # registering the current forum status (up/down) in the database
+    # registering the current market status (up/down) in the database
     marketId = verifyMarketPlace(cur, marketPlace)
     if (marketId > 0):
         create_status(cur, marketId, CURRENT_DATE, '1' if len(listings) > 0 else '0')

From 07bfc887b18581f7ce02dab9befb2e10abc72899 Mon Sep 17 00:00:00 2001
From: westernmeadow
Date: Wed, 25 Oct 2023 16:30:57 -0700
Subject: [PATCH 05/11] don't use cleanLink

---
 MarketPlaces/DarkBazar/crawler_selenium.py | 12 ++++++------
 MarketPlaces/DarkBazar/parser.py           |  1 -
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/MarketPlaces/DarkBazar/crawler_selenium.py b/MarketPlaces/DarkBazar/crawler_selenium.py
index fdfb640..d351c42 100644
--- a/MarketPlaces/DarkBazar/crawler_selenium.py
+++ b/MarketPlaces/DarkBazar/crawler_selenium.py
@@ -216,12 +216,12 @@ def crawlForum(driver):
                         savePage(driver, driver.page_source, item)
                         driver.back()

-                    # # comment out
-                    # break
-                    #
-                    # # comment out
-                    # if count == 1:
-                    #     break
+                    # comment out
+                    break
+
+                    # comment out
+                    if count == 1:
+                        break

                 try:
                     link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
diff --git a/MarketPlaces/DarkBazar/parser.py b/MarketPlaces/DarkBazar/parser.py
index 9386d18..3d56e92 100644
--- a/MarketPlaces/DarkBazar/parser.py
+++ b/MarketPlaces/DarkBazar/parser.py
@@ -170,7 +170,6 @@ def darkbazar_listing_parser(soup):

         # Adding the url to the list of urls
         link = bae[0].get('href')
-        link = cleanLink(link)
         href.append(link)

         # Finding the Product

From f0003d4b386ecf7188ee01bb53a40bcd1264ed06 Mon Sep 17 00:00:00 2001
From: Helium
Date: Thu, 26 Oct 2023 13:14:25 -0700
Subject: [PATCH 06/11] kingdom completed for initial testing, might need to create new account every once in a while.
the og account was deleted --- MarketPlaces/Initialization/prepare_parser.py | 6 + MarketPlaces/Kingdom/crawler_selenium.py | 121 ++++-------------- MarketPlaces/Kingdom/parser.py | 21 ++- 3 files changed, 50 insertions(+), 98 deletions(-) diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py index f3c792a..982995f 100644 --- a/MarketPlaces/Initialization/prepare_parser.py +++ b/MarketPlaces/Initialization/prepare_parser.py @@ -15,6 +15,8 @@ from MarketPlaces.M00nkeyMarket.parser import * from MarketPlaces.MikesGrandStore.parser import * from MarketPlaces.PabloEscobarMarket.parser import * from MarketPlaces.CityMarket.parser import * +from MarketPlaces.Kingdom.parser import * + from MarketPlaces.Classifier.classify_product import predict @@ -130,6 +132,8 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile): rw = pabloescobarmarket_listing_parser(soup) elif marketPlace == "CityMarket": rw = city_listing_parser(soup) + elif marketPlace == "Kingdom": + rw = kingdom_listing_parser(soup) else: print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!") raise Exception @@ -164,6 +168,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile): rmm = pabloescobarmarket_description_parser(soup) elif marketPlace == "CityMarket": rmm = city_description_parser(soup) + elif marketPlace == "Kingdom": + rmm = kingdom_description_parser(soup) else: print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!") raise Exception diff --git a/MarketPlaces/Kingdom/crawler_selenium.py b/MarketPlaces/Kingdom/crawler_selenium.py index e6b489f..5385150 100644 --- a/MarketPlaces/Kingdom/crawler_selenium.py +++ b/MarketPlaces/Kingdom/crawler_selenium.py @@ -1,4 +1,4 @@ -__author__ = 'DarkWeb' +__author__ = 'Helium' ''' Kingdom Market Crawler (Selenium) @@ -35,55 +35,27 @@ baseURL = 'http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion # Opens Tor Browser, crawls the website def startCrawling(): - # marketName = getMarketName() + mktName = getMKTName() driver = getAccess() if driver != 'down': try: - captcha(driver) login(driver) crawlForum(driver) except Exception as e: print(driver.current_url, e) closeDriver(driver) - # new_parse(marketName, False) + new_parse(mktName, baseURL, True) +# Login using premade account credentials and do login captcha manually +def login(driver): -def captcha(driver): - ''' - # wait for captcha page - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div/div[1]"))) - - # save captcha to local - driver.find_element(by=By.XPATH, value='/html/body/div/div[2]').screenshot( - r'..\Kingdom\captcha1.png') - - # This method will show image in any image viewer - im = Image.open(r'..\Kingdom\captcha1.png') - im.show() - - iframes = driver.find_elements(by=By.TAG_NAME, value='iframe') - - # ask user input captcha solution in terminal - print("Enter squares from smallest to largest (squares are numbered 1-9 left to right)") - for order in ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']: - id = input(f"{order}: ") - iframes[int(id)-1].click() - ''' input("Press ENTER when CAPTCHA is completed\n") # wait for login page WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button"))) - - -# Login using premade account credentials and do login captcha manually -def login(driver): - # wait for login page - WebDriverWait(driver, 
100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button"))) + (By.XPATH, '//*[@id="login-form"]'))) # entering username and password into input boxes usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-name"]') @@ -96,39 +68,17 @@ def login(driver): select = Select(driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-sessiontime"]')) select.select_by_visible_text('24 hours') - ''' - # wait for captcha page show up - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, '//*[@id="captcha"]'))) - - # save captcha to local - driver.find_element(by=By.XPATH, value='//*[@id="captcha"]').screenshot(r'..\Kingdom\captcha2.png') - - # This method will show image in any image viewer - im = Image.open(r'..\Kingdom\captcha2.png') - im.show() - - # wait until input space show up - inputBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-captcha"]') - - # ask user input captcha solution in terminal - userIn = input("Enter solution: ") - - # send user solution into the input space - inputBox.send_keys(userIn) - - # click the verify(submit) button - driver.find_element(by=By.XPATH, value="/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button").click() - ''' - input("Press ENTER when CAPTCHA is completed\n") + input("Press ENTER when CAPTCHA and DDOS is completed\n") # wait for listing page show up (This Xpath may need to change based on different seed url) WebDriverWait(driver, 50).until(EC.visibility_of_element_located( - (By.XPATH, '/html/body/div/div/div[3]/div[2]'))) + (By.XPATH, '/html/body/div/div/div[3]/div[1]/div/div[3]'))) + + # Returns the name of the website -def getMarketName(): +def getMKTName(): name = 'Kingdom' return name @@ -236,30 +186,17 @@ def getInterestedLinks(): links = [] # Software and Malware - links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=127&t=c298a77d9e93ad32') + links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=127&t=597a56b9a0b3e0d0') # # Services - # links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=45&t=c298a77d9e93ad32') - # # Exploits - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=45') - # # Tools - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=46') - # # Malware - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=47') - # # Cryptography - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=48') - # # Others - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=49') - # # Hacking Tutorials - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=50') - # # Hacked Accounts and Database Dumps - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=30') - # # Android Moded pak - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=53') + links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=45&t=597a56b9a0b3e0d0') + # # guides and tutorials + 
links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=107&t=597a56b9a0b3e0d0') return links def crawlForum(driver): + print("Crawling the Kingdom market") linksToCrawl = getInterestedLinks() @@ -281,6 +218,7 @@ def crawlForum(driver): savePage(driver, html, link) list = productPages(html) + for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) try: @@ -290,18 +228,15 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # comment out - break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: - temp = driver.find_element(by=By.XPATH, value= - '/html/body/div/div/div[3]/div[2]/div[2]/div/div/ul') - next = temp.find_element_by_class_name("next") - link = link.find_element_by_tag_name('a').get_attribute('href') + link = driver.find_element(by=By.XPATH, value='/html/body/div/div/div[3]/div[2]/div[2]/div[3]/div/ul/li[13]/a').get_attribute('href') if link == "": raise NoSuchElementException count += 1 @@ -313,7 +248,7 @@ def crawlForum(driver): print(link, e) i += 1 - input("Crawling Kingdom Market done sucessfully. Press ENTER to continue\n") + print("Crawling the Kingdom market done.") # Returns 'True' if the link is Topic link @@ -325,7 +260,7 @@ def isDescriptionLink(url): # Returns True if the link is a listingPage link def isListingLink(url): - if 'category' in url: + if 'filter_category' in url: return True return False @@ -333,10 +268,8 @@ def isListingLink(url): # calling the parser to define the links def productPages(html): soup = BeautifulSoup(html, "html.parser") - #print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text) return kingdom_links_parser(soup) def crawler(): - startCrawling() - # print("Crawling and Parsing BestCardingWorld .... 
DONE!") + startCrawling() \ No newline at end of file diff --git a/MarketPlaces/Kingdom/parser.py b/MarketPlaces/Kingdom/parser.py index b1e05d5..abade27 100644 --- a/MarketPlaces/Kingdom/parser.py +++ b/MarketPlaces/Kingdom/parser.py @@ -1,4 +1,4 @@ -__author__ = 'DarkWeb' +__author__ = 'Helium' # Here, we are importing the auxiliary functions to clean or convert data from MarketPlaces.Utilities.utilities import * @@ -31,6 +31,8 @@ def kingdom_description_parser(soup): left = "-1" # 16 Product_QuantityLeft shipFrom = "-1" # 17 Product_ShippedFrom shipTo = "-1" # 18 Product_ShippedTo + image = "-1" # 19 Product_Image + vendor_image = "-1" # 20 Vendor_Image # Finding Product Name @@ -95,7 +97,7 @@ def kingdom_description_parser(soup): # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo) + BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) # Sending the results @@ -126,7 +128,9 @@ def kingdom_listing_parser(soup): qLeft =[] # 17 Product_QuantityLeft shipFrom = [] # 18 Product_ShippedFrom shipTo = [] # 19 Product_ShippedTo - href = [] # 20 Product_Links + image = [] # 20 Product_Image + image_vendor = [] # 21 Vendor_Image + href = [] # 22 Product_Links listing = soup.find('div', {"id": "p0"}).find('div').find_all('div', {"class": "row"}, recursive=False) @@ -153,12 +157,20 @@ def kingdom_listing_parser(soup): product = product.strip() name.append(product) + # Finding Product Image + product_image = a.find('img') + product_image = product_image.get('src') + product_image = product_image.split('base64,')[-1] + image.append(product_image) + # Finding the Vendor vendor_name = a.select_one('a[href^="/user"]').text vendor_name = vendor_name.replace(",", " ").replace('/', '') vendor_name = vendor_name.strip() vendor.append(vendor_name) + image_vendor.append("-1") + # Adding the url to the list of urls link = a.find('div', {"class": "col-md-7"}).select_one('a[href^="/offer/view?"]')['href'] link = cleanLink(link) @@ -169,7 +181,8 @@ def kingdom_listing_parser(soup): # Populate the final variable (this should be a list with all fields scraped) return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, + image, image_vendor) def kingdom_links_parser(soup): From 1d091b944acb9f2f8e01b2a377cedab5201c9076 Mon Sep 17 00:00:00 2001 From: westernmeadow Date: Fri, 27 Oct 2023 14:34:07 -0700 Subject: [PATCH 07/11] removed indent --- MarketPlaces/DarkBazar/crawler_selenium.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MarketPlaces/DarkBazar/crawler_selenium.py b/MarketPlaces/DarkBazar/crawler_selenium.py index d351c42..dac91b0 100644 --- a/MarketPlaces/DarkBazar/crawler_selenium.py +++ b/MarketPlaces/DarkBazar/crawler_selenium.py @@ -236,7 +236,7 @@ def crawlForum(driver): print(link, e) i += 1 - print("Crawling the DarkBazar market done.") + print("Crawling the DarkBazar market done.") # Returns 'True' if the link is Topic link, may need to change for every website From b084d76d3ed6277bbb6e030828be564503321d0c Mon Sep 17 00:00:00 2001 From: westernmeadow Date: Fri, 27 Oct 2023 15:13:58 -0700 Subject: [PATCH 08/11] listings, descriptions, and reference date --- 
Forums/DB_Connection/db_connection.py | 19 +++++++++------ Forums/Initialization/prepare_parser.py | 8 +++++-- MarketPlaces/DB_Connection/db_connection.py | 24 ++++++++++++------- MarketPlaces/Initialization/prepare_parser.py | 8 +++++-- 4 files changed, 39 insertions(+), 20 deletions(-) diff --git a/Forums/DB_Connection/db_connection.py b/Forums/DB_Connection/db_connection.py index f0d4ed6..dfdec49 100644 --- a/Forums/DB_Connection/db_connection.py +++ b/Forums/DB_Connection/db_connection.py @@ -3,7 +3,7 @@ __author__ = 'DarkWeb' import psycopg2 import traceback from Forums.Utilities.utilities import * - +from dateutil.relativedelta import relativedelta, FR def connectDataBase(): @@ -484,21 +484,25 @@ def create_posts(cur, row, forumId, topicId): 'dateinserted_post': row[8], 'postId': postId}) -def create_status(cur, forumId, date, status): +def create_status(cur, forumId, date, listings, descriptions, status): date = datetime.strptime(date, "%m%d%Y") + # getting last Fridays a reference date + date_reference = date + relativedelta(weekday=FR(-1)) + # checking if status already exists sql = "select * from forums_status where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s" cur.execute(sql, {'forum_id': forumId, 'date_inserted': date}) recset = cur.fetchall() if recset: - sql = "Update forums_status set status = %(status)s where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s" - recset = {'status': status, 'forum_id': forumId, 'date_inserted': date} + sql = "Update forums_status set listings = %(listings)s, descriptions = %(descriptions)s, status = %(status)s, date_reference = %(date_reference)s " \ + "where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s" + recset = {'listings': listings, 'descriptions': descriptions, 'status': status, 'date_reference': date_reference, 'forum_id': forumId, 'date_inserted': date} else: - sql = "Insert into forums_status (forum_id, date_inserted, status) Values (%s, %s, %s)" - recset = [forumId, date, status] + sql = "Insert into forums_status (forum_id, date_inserted, listings, descriptions, status, date_reference) Values (%s, %s, %s, %s, %s, %s)" + recset = [forumId, date, listings, descriptions, status, date_reference] cur.execute(sql, recset) @@ -514,7 +518,8 @@ def create_database(cur, con): sql = "create unique index unique_forum ON forums USING btree (name_forum ASC NULLS LAST)" cur.execute(sql) - sql = "Create table forums_status (forum_id integer NOT NULL, date_inserted date NOT NULL, status bit(1) NOT NULL, " \ + sql = "Create table forums_status (forum_id integer NOT NULL, date_inserted date NOT NULL, " \ + "listings integer NOT NULL, descriptions integer NOT NULL, status bit(1) NOT NULL, date_reference date NOT NULL " \ "CONSTRAINT forums_log_pkey PRIMARY KEY (forum_id, date_inserted), " \ "CONSTRAINT forums_fk FOREIGN KEY (forum_id) REFERENCES forums (forum_id))" cur.execute(sql) diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py index 1f55319..31982fd 100644 --- a/Forums/Initialization/prepare_parser.py +++ b/Forums/Initialization/prepare_parser.py @@ -341,10 +341,14 @@ def new_parse(forum, url, createLog): # move listing files of completed folder move_file(listingFile, createLog, logFile) - # registering the current forum status (up/down) in the database + # registering the current forum status (up/down) and the number of scraped pages in the database forumId = verifyForum(cur, forum) if (forumId > 0): - create_status(cur, forumId, CURRENT_DATE, '1' if 
len(listings) > 0 else '0') + + readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.htm')) + readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.htm')) + + create_status(cur, forumId, CURRENT_DATE, len(readListings), len(readDescriptions), '1' if len(listings) > 0 else '0') con.commit() if createLog: diff --git a/MarketPlaces/DB_Connection/db_connection.py b/MarketPlaces/DB_Connection/db_connection.py index 4f439f0..2f3341a 100644 --- a/MarketPlaces/DB_Connection/db_connection.py +++ b/MarketPlaces/DB_Connection/db_connection.py @@ -4,7 +4,7 @@ import psycopg2 import traceback import configparser from MarketPlaces.Utilities.utilities import * - +from dateutil.relativedelta import relativedelta, FR def connectDataBase(): @@ -273,6 +273,8 @@ def create_items(cur, row, marketId, vendorId): if newItem: + # decode_decrypt_image_in_base64(row[20]) + sql = "Insert into items (item_id, market_id, vendor_id, name_item, description_item, cve_item, ms_item, category_item, " \ "views_item, reviews_item, rating_item, dateadded_item, btc_item, usd_item, euro_item, quantitysold_item, " \ "quantityleft_item, shippedfrom_item, shippedto_item, lastseen_item, image_item, href_item, dateinserted_item, " \ @@ -312,7 +314,7 @@ def create_items(cur, row, marketId, vendorId): recset = cur.fetchall() - # decode_decrypt_image_in_base64(recset[0][20]) + # decode_decrypt_image_in_base64(recset[0]['image_item']) if (str(recset[0]['description_item']) != str(row[5] if row[5] != '-1' else None) or str(recset[0]['cve_item']) != str(row[6] if row[6] != '-1' else None) or @@ -401,24 +403,27 @@ def create_items(cur, row, marketId, vendorId): return itemId -def create_status(cur, marketId, date, status): +def create_status(cur, marketId, date, listings, descriptions, status): date = datetime.strptime(date, "%m%d%Y") + # getting last Fridays a reference date + date_reference = date + relativedelta(weekday=FR(-1)) + # checking if status already exists sql = "select * from marketplaces_status where market_id = %(market_id)s and date_inserted = %(date_inserted)s" cur.execute(sql, {'market_id': marketId, 'date_inserted': date}) recset = cur.fetchall() if recset: - sql = "Update marketplaces_status set status = %(status)s where market_id = %(market_id)s and date_inserted = %(date_inserted)s" - recset = {'status': status, 'market_id': marketId, 'date_inserted': date} + sql = "Update marketplaces_status set listings = %(listings)s, descriptions = %(descriptions)s, status = %(status)s, date_reference = %(date_reference)s " \ + "where market_id = %(market_id)s and date_inserted = %(date_inserted)s" + recset = {'listings': listings, 'descriptions': descriptions, 'status': status, 'date_reference': date_reference, 'market_id': marketId, 'date_inserted': date} else: - sql = "Insert into marketplaces_status (market_id, date_inserted, status) Values (%s, %s, %s)" - recset = [marketId, date, status] + sql = "Insert into marketplaces_status (market_id, date_inserted, listings, descriptions, status, date_reference) Values (%s, %s, %s, %s, %s, %s)" + recset = [marketId, date, listings, descriptions, status, date_reference] cur.execute(sql, recset) - def create_database(cur, con): try: @@ -431,7 +436,8 @@ def create_database(cur, con): sql = "create unique index unique_market ON marketplaces USING btree (name_market ASC NULLS LAST)" cur.execute(sql) - sql = "Create table marketplaces_status (market_id integer NOT NULL, date_inserted date NOT NULL, status bit(1) NOT NULL, " 
\ + sql = "Create table marketplaces_status (market_id integer NOT NULL, date_inserted date NOT NULL, " \ + "listings integer NOT NULL, descriptions integer NOT NULL, status bit(1) NOT NULL, date_reference date NOT NULL " \ "CONSTRAINT marketplaces_log_pkey PRIMARY KEY (market_id, date_inserted), " \ "CONSTRAINT marketplaces_fk FOREIGN KEY (market_id) REFERENCES marketplaces (market_id))" cur.execute(sql) diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py index 982995f..7c35f5a 100644 --- a/MarketPlaces/Initialization/prepare_parser.py +++ b/MarketPlaces/Initialization/prepare_parser.py @@ -369,10 +369,14 @@ def new_parse(marketPlace, url, createLog): # move listing files of completed folder move_file(listingFile, createLog, logFile) - # registering the current market status (up/down) in the database + # registering the current forum status (up/down) and the number of scraped pages in the database marketId = verifyMarketPlace(cur, marketPlace) if (marketId > 0): - create_status(cur, marketId, CURRENT_DATE, '1' if len(listings) > 0 else '0') + + readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.htm')) + readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.htm')) + + create_status(cur, marketId, CURRENT_DATE, len(readListings), len(readDescriptions), '1' if len(listings) > 0 else '0') con.commit() if createLog: From c976032cc40945cf8a660ab9551e6366a95f6927 Mon Sep 17 00:00:00 2001 From: westernmeadow Date: Fri, 27 Oct 2023 15:35:31 -0700 Subject: [PATCH 09/11] small fixes --- Forums/Initialization/prepare_parser.py | 4 ++-- MarketPlaces/Initialization/prepare_parser.py | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py index 31982fd..b86b5c6 100644 --- a/Forums/Initialization/prepare_parser.py +++ b/Forums/Initialization/prepare_parser.py @@ -345,8 +345,8 @@ def new_parse(forum, url, createLog): forumId = verifyForum(cur, forum) if (forumId > 0): - readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.htm')) - readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.htm')) + readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.html')) + readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.html')) create_status(cur, forumId, CURRENT_DATE, len(readListings), len(readDescriptions), '1' if len(listings) > 0 else '0') con.commit() diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py index 7c35f5a..de6cc79 100644 --- a/MarketPlaces/Initialization/prepare_parser.py +++ b/MarketPlaces/Initialization/prepare_parser.py @@ -15,9 +15,10 @@ from MarketPlaces.M00nkeyMarket.parser import * from MarketPlaces.MikesGrandStore.parser import * from MarketPlaces.PabloEscobarMarket.parser import * from MarketPlaces.CityMarket.parser import * +from MarketPlaces.DarkBazar.parser import * +from MarketPlaces.Sonanza.parser import * from MarketPlaces.Kingdom.parser import * - from MarketPlaces.Classifier.classify_product import predict nError = 0 @@ -132,6 +133,10 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile): rw = pabloescobarmarket_listing_parser(soup) elif marketPlace == "CityMarket": rw = city_listing_parser(soup) + elif marketPlace == "DarkBazar": + rw = 
darkbazar_listing_parser(soup) + elif marketPlace == "Sonanza": + rw = sonanza_listing_parser(soup) elif marketPlace == "Kingdom": rw = kingdom_listing_parser(soup) else: @@ -168,6 +173,10 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile): rmm = pabloescobarmarket_description_parser(soup) elif marketPlace == "CityMarket": rmm = city_description_parser(soup) + elif marketPlace == "DarkBazar": + rmm = darkbazar_description_parser(soup) + elif marketPlace == "Sonanza": + rmm = sonanza_description_parser(soup) elif marketPlace == "Kingdom": rmm = kingdom_description_parser(soup) else: @@ -373,8 +382,8 @@ def new_parse(marketPlace, url, createLog): marketId = verifyMarketPlace(cur, marketPlace) if (marketId > 0): - readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.htm')) - readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.htm')) + readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.html')) + readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.html')) create_status(cur, marketId, CURRENT_DATE, len(readListings), len(readDescriptions), '1' if len(listings) > 0 else '0') con.commit() From 746ec6ddd9a92b93fffc27d5a02ba2e130a3d559 Mon Sep 17 00:00:00 2001 From: chris Date: Mon, 30 Oct 2023 00:31:19 -0700 Subject: [PATCH 10/11] Added crawler and parser for Black Pyramid Marketplace --- MarketPlaces/BlackPyramid/crawler_selenium.py | 302 +++++++++------- MarketPlaces/BlackPyramid/parser.py | 341 +++++++++++------- 2 files changed, 391 insertions(+), 252 deletions(-) diff --git a/MarketPlaces/BlackPyramid/crawler_selenium.py b/MarketPlaces/BlackPyramid/crawler_selenium.py index b257c40..cf93b4a 100644 --- a/MarketPlaces/BlackPyramid/crawler_selenium.py +++ b/MarketPlaces/BlackPyramid/crawler_selenium.py @@ -1,9 +1,7 @@ -__author__ = 'Helium' +__author__ = 'cern' ''' -BlackPyramid Forum Crawler (Selenium) -cannot use bc no links are used -kept in case issues are solved +BlackPyramid Market Crawler (Selenium) ''' from selenium import webdriver @@ -11,64 +9,101 @@ from selenium.common.exceptions import NoSuchElementException from selenium.webdriver.firefox.firefox_profile import FirefoxProfile from selenium.webdriver.firefox.firefox_binary import FirefoxBinary from selenium.webdriver.firefox.service import Service -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By - +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver import ActionChains +import selenium.webdriver.support.ui as uiClasses from PIL import Image + import urllib.parse as urlparse import os, re, time -from datetime import date import subprocess import configparser from bs4 import BeautifulSoup from MarketPlaces.Initialization.prepare_parser import new_parse -from MarketPlaces.BlackPyramid.parser import blackpyramid_links_parser +from MarketPlaces.BlackPyramid.parser import BlackPyramid_links_parser from MarketPlaces.Utilities.utilities import cleanHTML +import traceback + +config = configparser.ConfigParser() +config.read('../../setup.ini') counter = 1 -baseURL = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/login/' +baseURL = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/login/?login=1' -# Opens Tor Browser, 
crawls the website, then parses, then closes tor -#acts like the main method for the crawler, another function at the end of this code calls this function later +# Opens Tor Browser, crawls the website def startCrawling(): - mktName = getMKTName() + # Opening tor beforehand gives "Tor exited during startup error" + # opentor() + + marketName = getMarketName() + driver = getAccess() + # Wait for website to load + input("Press ENTER when website has loaded") + if driver != 'down': try: login(driver) crawlForum(driver) except Exception as e: print(driver.current_url, e) - closeDriver(driver) + closetor(driver) + + new_parse(marketName, baseURL, False) - new_parse(mktName, baseURL, True) + +# Opens Tor Browser +def opentor(): + global pid + print("Connecting Tor...") + pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) + pid = pro.pid + time.sleep(7.5) + input('Tor Connected. Press ENTER to continue\n') + return + + +# Login +def login(driver): + # entering username and password into input boxes + usernameBox = driver.find_element(by=By.XPATH, value="//input[@name='username_login']") + # Username here + usernameBox.send_keys('ChipotleSteakBurrito') + passwordBox = driver.find_element(by=By.XPATH, value="//input[@name='password_login']") + # Password here + passwordBox.send_keys('BlackBeans') + + input("Press ENTER when CAPTCHA is completed\n") + + # wait for listing page show up (This Xpath may need to change based on different seed url) + #WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + # (By.XPATH, '/html/body/div[2]/div[3]/div[3]/div[1]/div[3]/nav/ul/li[10]/a'))) # Returns the name of the website -#return: name of site in string type -def getMKTName(): +def getMarketName(): name = 'BlackPyramid' return name -# Return the base link of the website -#return: url of base site in string type +# Return the link of the website def getFixedURL(): - url = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/' + url = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/login/?login=1' + return url # Closes Tor Browser -#@param: current selenium driver -def closeDriver(driver): +def closetor(driver): # global pid # os.system("taskkill /pid " + str(pro.pid)) # os.system("taskkill /t /f /im tor.exe") print('Closing Tor...') - driver.close() + driver.quit() time.sleep(3) return @@ -76,8 +111,6 @@ def closeDriver(driver): # Creates FireFox 'driver' and configure its 'Profile' # to use Tor proxy and socket def createFFDriver(): - from MarketPlaces.Initialization.markets_mining import config - ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) @@ -106,16 +139,13 @@ def createFFDriver(): driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - driver.maximize_window() - return driver -#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' -#return: return the selenium driver or string 'down' def getAccess(): url = getFixedURL() driver = createFFDriver() + input('Tor Connected. 
Press ENTER to continue\n') try: driver.get(url) return driver @@ -124,33 +154,9 @@ def getAccess(): return 'down' -# Manual captcha solver, waits fora specific element so that the whole page loads, finds the input box, gets screenshot of captcha -# then allows for manual solving of captcha in the terminal -#@param: current selenium web driver -def login(driver): - # wait for login page - login_link = driver.find_element(by=By.XPATH, value='/html/body/div/div/div[3]/div/main/div/div/div/div[2]/div/div/div/section[1]/input[1]') - login_link.click() # open tab with url - - # entering username and password into input boxes - usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') - # Username here - usernameBox.send_keys('ChipotleSteakBurrito') - passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') - # Password here - passwordBox.send_keys('BlackBeans') - - input("Press ENTER when CAPTCHA is completed\n") - - # wait for listing page show up (This Xpath may need to change based on different seed url) - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, '/html/body/div[2]/form/nav/nav/ul/li[2]/div/a/span[1]'))) - - - -# Saves the crawled html page, makes the directory path for html pages if not made -def savePage(driver, page, url): - cleanPage = cleanHTML(driver, page) +# Saves the crawled html page +def savePage(page, url): + cleanPage = cleanHTML(page) filePath = getFullPathName(url) os.makedirs(os.path.dirname(filePath), exist_ok=True) open(filePath, 'wb').write(cleanPage.encode('utf-8')) @@ -158,100 +164,148 @@ def savePage(driver, page, url): # Gets the full path of the page to be saved along with its appropriate file name -#@param: raw url as crawler crawls through every site def getFullPathName(url): - from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE - - mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") + global counter + from MarketPlaces.Initialization.markets_mining import CURRENT_DATE fileName = getNameFromURL(url) if isDescriptionLink(url): - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') + if (os.path.exists(r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html')): + fullPath = r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + "(" + str(counter) + ")" + '.html' + else: + fullPath = r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html' else: - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') + if (os.path.exists(r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html')): + fullPath = r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + "(" + str(counter) + ")" + '.html' + else: + fullPath = r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html' return fullPath -# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned -#@param: raw url as crawler crawls through every site +# Creates the file name from passed URL def getNameFromURL(url): global counter name = ''.join(e for e in url if e.isalnum()) - if (name == ''): + if name == '': name = str(counter) counter = counter + 1 return name +def goToPage(driver, page): + # hover over digital -> hacking tools + a = ActionChains(driver) + + # hover + digitalB = 
driver.find_element(By.XPATH, "//li[@class='dig940']/div/a") + time.sleep(1) + a.move_to_element(digitalB).perform() + print(digitalB) + + # delay for website to register hover + time.sleep(10) + + # click + #xpath = "//input[@value='" + page + "']" + xpath = "//input[@name='" + page + "']" + link = driver.find_element(By.XPATH, xpath) + time.sleep(1) + a.move_to_element(link).click().perform() + print(link) + + # wait for website to load + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '/html/body/center/div[4]/div[1]/div[3]/article/div[1]/h1/a'))) + -# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list -#in this example, there are a couple of categories some threads fall under such as -# Guides and Tutorials, Digital Products, and Software and Malware -#as you can see they are categories of products def getInterestedLinks(): - links = [] - - # Hacking Guides - links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # Exploits - # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # botnets/malware - # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # fraud software - # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # Other Tools - # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # Services - # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') + # h11 -> Hacking Tools + # g3 -> Guides, Hacking + # se3 -> Services, Hacking + # f6 -> Fraud software + links = ['h11','g3','se3','f6'] return links -# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through -#topic and description pages are crawled through here, where both types of pages are saved -#@param: selenium driver def crawlForum(driver): print("Crawling the BlackPyramid market") - linksToCrawl = getInterestedLinks() + #linksToCrawl = getInterestedLinks() + #pages = ["Hacking Tools"] + pages = getInterestedLinks() + #visited = set(linksToCrawl) + initialTime = time.time() i = 0 - while i < len(linksToCrawl): - link = linksToCrawl[i] - print('Crawling :', link) + count = 0 + + for listing in pages: + #link = linksToCrawl[i] + print('Crawling :', listing) + try: - has_next_page = True - count = 0 + try: + goToPage(driver, listing) + except: + print("Try block 1") + driver.refresh() + time.sleep(5) + html = driver.page_source + savePage(html, listing) + has_next_page = True + currentPage = 1 + numberOfPages = 1 while has_next_page: - try: - clicker = driver.find_element(by=By.XPATH, value='/html/body/div[2]/form/nav/nav/ul/li[2]/div/a') - clicker.click() # open tab with url - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(driver, html, link) - + # get a list of urls for each listing list = productPages(html) for item in list: + itemURL = urlparse.urljoin(baseURL, str(item)) try: driver.get(itemURL) except: + print("Try block 2") driver.refresh() - savePage(driver, driver.page_source, item) - driver.back() + savePage(driver.page_source, item) + # can't use the back button in dark pyramid + # driver.back() # comment out - break + # break # comment out - if count == 1: - break + # if count == 1: + # count = 0 + # break + # go to 
next page of market try: - clicker = driver.find_element(by=By.XPATH, value= - '/html/body/center/div[4]/div/div[3]/div[23]/div[2]/input[1]') - if clicker == "": + goToPage(driver, listing) + nav = driver.find_element(by=By.XPATH, value="//input[@name='next_page']") + + if not nav.is_enabled(): + raise NoSuchElementException + try: + # block obscuring element + #element = driver.find_element(by=By.XPATH, value="//input[@class='tei39950693']") + #driver.execute_script("arguments[0].style.visibility='hidden'", element) + # select next page + pgnum = uiClasses.Select(driver.find_element(by=By.XPATH, value="//select[@name='pageination']")) + print("pg options:", pgnum.options) + pgnum.select_by_index(currentPage) + numberOfPages = len(pgnum.options) + + # click button + pgbutton = driver.find_element(by=By.XPATH, value="//input[@value='go to page']") + pgbutton.click() + except Exception as e: + print(e) + raise NoSuchElementException + time.sleep(10) + html = driver.page_source + savePage(html, listing) + currentPage += 1 + if currentPage > numberOfPages: raise NoSuchElementException count += 1 @@ -259,39 +313,39 @@ def crawlForum(driver): has_next_page = False except Exception as e: - print(link, e) + traceback.print_exc() + print(listing, e) i += 1 - print("Crawling the BlackPyramid market done.") + # finalTime = time.time() + # print finalTime - initialTime + + input("Crawling Dark Pyramid done successfully. Press ENTER to continue\n") -# Returns 'True' if the link is a description link -#@param: url of any url crawled -#return: true if is a description page, false if not +# Returns 'True' if the link is Topic link def isDescriptionLink(url): - if 'products' in url: + if 'product' in url: return True return False # Returns True if the link is a listingPage link -#@param: url of any url crawled -#return: true if is a Listing page, false if not def isListingLink(url): - if 'search' in url: + if 'category=' in url: return True return False -# calling the parser to define the links, the html is the url of a link from the list of interested link list -#@param: link from interested link list ie. getInterestingLinks() -#return: list of description links that should be crawled through +# calling the parser to define the links def productPages(html): soup = BeautifulSoup(html, "html.parser") - return blackpyramid_links_parser(soup) - + return BlackPyramid_links_parser(soup) def crawler(): startCrawling() - # print("Crawling and Parsing BlackPyramid .... DONE!") + # print("Crawling and Parsing BestCardingWorld .... 
DONE!") + +if __name__ == '__main__': + startCrawling() \ No newline at end of file diff --git a/MarketPlaces/BlackPyramid/parser.py b/MarketPlaces/BlackPyramid/parser.py index 743466a..5224c1e 100644 --- a/MarketPlaces/BlackPyramid/parser.py +++ b/MarketPlaces/BlackPyramid/parser.py @@ -1,4 +1,4 @@ -__author__ = 'Helium' +__author__ = 'cern' # Here, we are importing the auxiliary functions to clean or convert data from MarketPlaces.Utilities.utilities import * @@ -11,7 +11,7 @@ from bs4 import BeautifulSoup #stores info it needs in different lists, these lists are returned after being organized #@param: soup object looking at html page of description page #return: 'row' that contains a variety of lists that each hold info on the description page -def darkfox_description_parser(soup): +def BlackPyramid_description_parser(soup): # Fields to be parsed @@ -40,82 +40,71 @@ def darkfox_description_parser(soup): EURO = "-1" # 22 Product_EURO_SellingPrice # Finding Product Name - name = soup.find('h1').text + name = soup.find('div', {'class': 'panel39002'}).find('span').next_sibling name = name.replace('\n', ' ') name = name.replace(",", "") name = name.strip() + # product description + describe = soup.findAll('div', {'class': 'fer048953'})[1].text + describe = describe.replace('\n', ' ') + describe = describe.replace(",", "") + describe = describe.strip() + # Finding Vendor - vendor = soup.find('h3').find('a').text.strip() + vendor = soup.find('div', {'class': 'bold03905 vstat364'}).text + vendor = vendor.split(" ") + vendor = vendor[2][:-1] + vendor = vendor.replace('\n', ' ') + vendor = vendor.replace(",", "") + vendor = vendor.strip() # Finding Vendor Rating - rating = soup.find('span', {'class': "tag is-dark"}).text.strip() + rating_span = soup.find('span', {'class': 'to3098503t'}).find_next_sibling('span') + rating_num = rating_span.find('b').text + if rating_num != 'N/A': + rating = rating_num[0:3] # Finding Successful Transactions - success = soup.find('h3').text - success = success.replace("Vendor: ", "") - success = success.replace(vendor, "") - success = success.replace("(", "") - success = success.replace(")", "") + success_container = soup.find('ul', {'class': 'ul3o00953'}).findAll('li')[1] + success = success_container.find('div').text + success = success.replace('"', '') + success = success.replace("\n", " ") + success = success.replace(",", "") success = success.strip() - bae = soup.find('div', {'class': "box"}).find_all('ul') - # Finding Prices - USD = bae[1].find('strong').text.strip() - - li = bae[2].find_all('li') + USD_text = soup.find('li', {'class': 'vul2994 vghul995'}).find('div').text + USD = USD_text.split(',')[1] + USD = USD.replace('\n', ' ') + USD = USD.replace(",", "") + USD = USD.strip() - # Finding Escrow - escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip() - - # Finding the Product Category - category = li[1].find('span', {'class': "tag is-dark"}).text.strip() - - # Finding the Product Quantity Available - left = li[3].find('span', {'class': "tag is-dark"}).text.strip() + container = soup.find('ul', {'class': 'bic03095'}) # Finding Number Sold - sold = li[4].find('span', {'class': "tag is-dark"}).text.strip() - - li = bae[3].find_all('li') - - # Finding Shipment Information (Origin) - if "Ships from:" in li[-2].text: - shipFrom = li[-2].text - shipFrom = shipFrom.replace("Ships from: ", "") - # shipFrom = shipFrom.replace(",", "") - shipFrom = shipFrom.strip() - - # Finding Shipment Information (Destination) - shipTo = li[-1].find('div', {'title': 
"List of countries is scrollable"}).text - shipTo = shipTo.replace("Ships to: ", "") - shipTo = shipTo.strip() - if "certain countries" in shipTo: - countries = "" - tags = li[-1].find_all('span', {'class': "tag"}) - for tag in tags: - country = tag.text.strip() - countries += country + ", " - shipTo = countries.strip(", ") - - # Finding the Product description - describe = soup.find('div', {'class': "pre-line"}).text - describe = describe.replace("\n", " ") - describe = describe.strip() + sold_container = container.find('li') + sold_div = sold_container.findAll('div')[2] + sold = sold_div.find('b').next_sibling + sold = sold.replace('"', '') + sold = sold.replace("\n", " ") + sold = sold.replace(",", "") + sold = sold.strip() - '''# Finding the Number of Product Reviews - tag = soup.findAll(text=re.compile('Reviews')) - for index in tag: - reviews = index - par = reviews.find('(') - if par >=0: - reviews = reviews.replace("Reviews (","") - reviews = reviews.replace(")","") - reviews = reviews.split(",") - review = str(abs(int(reviews[0])) + abs(int(reviews[1]))) - else : - review = "-1"''' + # Finding the Product Quantity Available + left_container = container.find('li') + left_div = left_container.findAll('div')[3] + left = left_div.find('b').next_sibling + left = left.replace('"', '') + left = left.replace("\n", " ") + left = left.replace(",", "") + left = left.strip() + + # Finding number of reviews + positive = soup.find('span', {'class': 'ar04999324'}).text + neutral = soup.find('span', {'class': 'ti9400005 can39953'}).text + negative = soup.find('span', {'class': 'ti9400005 ti90088 can39953'}).text + review = int(positive) + int(neutral) + int(negative) # Searching for CVE and MS categories cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) @@ -147,11 +136,11 @@ def darkfox_description_parser(soup): #stores info it needs in different lists, these lists are returned after being organized #@param: soup object looking at html page of listing page #return: 'row' that contains a variety of lists that each hold info on the listing page -def darkfox_listing_parser(soup): +def BlackPyramid_listing_parser(soup): # Fields to be parsed nm = 0 # Total_Products (Should be Integer) - mktName = "DarkFox" # 0 Marketplace_Name + mktName = "BlackPyramid" # 0 Marketplace_Name name = [] # 1 Product_Name CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures) MS = [] # 3 Product_MS_Classification (Microsoft Security) @@ -169,80 +158,82 @@ def darkfox_listing_parser(soup): qLeft =[] # 15 Product_QuantityLeft shipFrom = [] # 16 Product_ShippedFrom shipTo = [] # 17 Product_ShippedTo - vendor = [] # 18 Vendor - rating = [] # 19 Vendor_Rating - success = [] # 20 Vendor_Successful_Transactions + rating_item = [] # 18 Product_Rating + vendor = [] # 19 Vendor + rating = [] # 20 Vendor_Rating + success = [] # 21 Vendor_Successful_Transactions href = [] # 23 Product_Links (Urls) - listing = soup.findAll('div', {"class": "card"}) + listing = soup.findAll('article', {"class": "product"}) + + # Some listing pages have an additional article section which is blank + if not listing[-1].findAll('a', href=True): + listing = listing[:-1] + # Populating the Number of Products nm = len(listing) - for a in listing: - bae = a.findAll('a', href=True) + for card in listing: + bae = card.findAll('a', href=True) # Adding the url to the list of urls - link = bae[0].get('href') + link = bae[2].get('href') link = cleanLink(link) href.append(link) # Finding the Product - product = bae[1].find('p').text + 
product = bae[3].text product = product.replace('\n', ' ') product = product.replace(",", "") product = product.replace("...", "") product = product.strip() name.append(product) - bae = a.find('div', {'class': "media-content"}).find('div').find_all('div') - - if len(bae) >= 5: - # Finding Prices - price = bae[0].text - ud = price.replace(" USD", " ") - # u = ud.replace("$","") - u = ud.replace(",", "") - u = u.strip() - USD.append(u) - # bc = (prc[1]).strip(' BTC') - # BTC.append(bc) - - # Finding the Vendor - vendor_name = bae[1].find('a').text - vendor_name = vendor_name.replace(",", "") - vendor_name = vendor_name.strip() - vendor.append(vendor_name) - - # Finding the Category - cat = bae[2].find('small').text - cat = cat.replace("Category: ", "") - cat = cat.replace(",", "") - cat = cat.strip() - category.append(cat) - - # Finding Number Sold and Quantity Left - num = bae[3].text - num = num.replace("Sold: ", "") - num = num.strip() - sold.append(num) - - quant = bae[4].find('small').text - quant = quant.replace("In stock: ", "") - quant = quant.strip() - qLeft.append(quant) - - # Finding Successful Transactions - freq = bae[1].text - freq = freq.replace(vendor_name, "") - freq = re.sub(r'Vendor Level \d+', "", freq) - freq = freq.replace("(", "") - freq = freq.replace(")", "") - freq = freq.strip() - success.append(freq) + # Finding description + # 'recurisve = False' only searches direct children + desc = card.findChildren('div', recursive=False)[0] + desc = desc.findAll('div', recursive=False)[3].text + desc = desc.replace('\n', ' ') + desc = desc.replace(",", "") + desc = desc.strip() + describe.append(desc) + + # Finding Vendor Name + vendor_name = bae[4].find('span').text + vendor_name = vendor_name.split(' ')[1] + vendor_name = vendor_name.replace('\n', ' ') + vendor_name = vendor_name.replace(",", "") + vendor_name = vendor_name.strip() + vendor.append(vendor_name) + + # Finding the Category + cat = card.findAll('div', recursive=False)[0].findAll('div', recursive=False)[1].find('span').text + cat = cat.replace("\n", "") + cat = cat.replace(",", "") + cat = cat.strip() + category.append(cat) + + bae = card.findAll('div', recursive=False)[1].findAll('div', recursive=False)[1] + + # Finding amount left + left = bae.findAll('div', recursive=False)[1].text + left = left.replace("x", "") + left = left.replace('\n', ' ') + left = left.replace(",", "") + left = left.strip() + qLeft.append(left) + + # Finding amount sold + qsold = bae.findAll('div', recursive=False)[2].text + qsold = qsold.replace('\n', ' ') + qsold = qsold.replace("x", "") + qsold = qsold.replace(",", "") + qsold = qsold.strip() + sold.append(qsold) # Searching for CVE and MS categories - cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) + cve = card.findAll(text=re.compile('CVE-\d{4}-\d{4}')) if not cve: cveValue="-1" else: @@ -255,7 +246,7 @@ def darkfox_listing_parser(soup): cveValue=cee CVE.append(cveValue) - ms = a.findAll(text=re.compile('MS\d{2}-\d{3}')) + ms = card.findAll(text=re.compile('MS\d{2}-\d{3}')) if not ms: MSValue="-1" else: @@ -269,23 +260,117 @@ def darkfox_listing_parser(soup): MS.append(MSValue) # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen, - BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href) + return organizeProducts(mktName, nm, vendor, rating, success, name, CVE, MS, category, describe, views, reviews, rating, + 
addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) #called by the crawler to get description links on a listing page #@param: beautifulsoup object that is using the correct html page (listing page) #return: list of description links from a listing page -def blackpyramid_links_parser(soup): +def BlackPyramid_links_parser(soup): # Returning all links that should be visited by the Crawler href = [] listing = soup.findAll('article', {"class": "product"}) - for div in listing: - - link = div.find('a', {"class": "ah39063"})['href'] - href.append(link) - - return href \ No newline at end of file + for item in listing: + + container = item.find('a', {"class": "ah39063"}) + + if container: + link = item.find('a', {"class": "ah39063"})['href'] + href.append(link) + + return href + + +import glob +import os +import codecs +import shutil +import traceback + +if __name__ == '__main__': + nError = 0 + marketPlace = 'BlackPyramid' + + lines = [] # listing pages + lns = [] # description pages + detPage = {} + + ''' + # reading description pages + count = 0 + for fileDescription in glob.glob(os.path.join("..\\" + marketPlace + "\\HTML_Pages\\10222023\\Description", '*.html')): + count += 1 + lns.append(fileDescription) + # if count > 5: + # break + + for index, line2 in enumerate(lns): + + print("Reading description folder of '" + marketPlace + "', file '" + os.path.basename(line2) + "', index= " + str(index + 1) + " ... " + str(len(lns))) + + try: + html = codecs.open(line2.strip('\n'), encoding='utf8') + soup = BeautifulSoup(html, "html.parser") + html.close() + except: + + try: + html = open(line2.strip('\n')) + soup = BeautifulSoup(html, "html.parser") + html.close() + except: + + nError += 1 + print("There was a problem to read the file " + line2 + " in the Description section!") + # if createLog: + # logFile.write(str(nError) + ". There was a problem to read the file " + line2 + " in the Description section.\n") + continue + + try: + print(BlackPyramid_description_parser(soup)) + except: + traceback.print_exc() + print("There was a problem to parse the file " + line2 + " in the Description section!") + ''' + # reading listing pages + count = 0 + for fileListing in glob.glob(os.path.join("..\\" + marketPlace + "\\HTML_Pages\\10222023\\Listing", '*.html')): + count += 1 + lines.append(fileListing) + # if count > 1: + # break + + for index, line1 in enumerate(lines): + + print("Reading listing folder of '" + marketPlace + "', file '" + os.path.basename(line1) + "', index= " + str( + index + 1) + " ... 
" + str(len(lines))) + + readError = False + try: + html = codecs.open(line1.strip('\n'), encoding='utf8') + soup = BeautifulSoup(html, "html.parser") + html.close() + except: + try: + html = open(line1.strip('\n')) + soup = BeautifulSoup(html, "html.parser") + html.close() + except: + print("There was a problem to read the file " + line1 + " in the Listing section!") + readError = True + + if not readError: + + parseError = False + try: + print(BlackPyramid_listing_parser(soup)) + except: + traceback.print_exc() + print("There was a problem to parse the file " + line1 + " in the listing section!") + parseError = True + + print("DONE") \ No newline at end of file From 2e34fe2e7d2447b9694dc8b1e805ed703f494df6 Mon Sep 17 00:00:00 2001 From: chris Date: Mon, 30 Oct 2023 00:33:48 -0700 Subject: [PATCH 11/11] Cleaned up some test comments in crawler and parser --- MarketPlaces/BlackPyramid/crawler_selenium.py | 4 - MarketPlaces/BlackPyramid/parser.py | 91 ------------------- 2 files changed, 95 deletions(-) diff --git a/MarketPlaces/BlackPyramid/crawler_selenium.py b/MarketPlaces/BlackPyramid/crawler_selenium.py index cf93b4a..6f7e45a 100644 --- a/MarketPlaces/BlackPyramid/crawler_selenium.py +++ b/MarketPlaces/BlackPyramid/crawler_selenium.py @@ -204,7 +204,6 @@ def goToPage(driver, page): time.sleep(10) # click - #xpath = "//input[@value='" + page + "']" xpath = "//input[@name='" + page + "']" link = driver.find_element(By.XPATH, xpath) time.sleep(1) @@ -286,9 +285,6 @@ def crawlForum(driver): if not nav.is_enabled(): raise NoSuchElementException try: - # block obscuring element - #element = driver.find_element(by=By.XPATH, value="//input[@class='tei39950693']") - #driver.execute_script("arguments[0].style.visibility='hidden'", element) # select next page pgnum = uiClasses.Select(driver.find_element(by=By.XPATH, value="//select[@name='pageination']")) print("pg options:", pgnum.options) diff --git a/MarketPlaces/BlackPyramid/parser.py b/MarketPlaces/BlackPyramid/parser.py index 5224c1e..4b45ee7 100644 --- a/MarketPlaces/BlackPyramid/parser.py +++ b/MarketPlaces/BlackPyramid/parser.py @@ -283,94 +283,3 @@ def BlackPyramid_links_parser(soup): href.append(link) return href - - -import glob -import os -import codecs -import shutil -import traceback - -if __name__ == '__main__': - nError = 0 - marketPlace = 'BlackPyramid' - - lines = [] # listing pages - lns = [] # description pages - detPage = {} - - ''' - # reading description pages - count = 0 - for fileDescription in glob.glob(os.path.join("..\\" + marketPlace + "\\HTML_Pages\\10222023\\Description", '*.html')): - count += 1 - lns.append(fileDescription) - # if count > 5: - # break - - for index, line2 in enumerate(lns): - - print("Reading description folder of '" + marketPlace + "', file '" + os.path.basename(line2) + "', index= " + str(index + 1) + " ... " + str(len(lns))) - - try: - html = codecs.open(line2.strip('\n'), encoding='utf8') - soup = BeautifulSoup(html, "html.parser") - html.close() - except: - - try: - html = open(line2.strip('\n')) - soup = BeautifulSoup(html, "html.parser") - html.close() - except: - - nError += 1 - print("There was a problem to read the file " + line2 + " in the Description section!") - # if createLog: - # logFile.write(str(nError) + ". 
There was a problem to read the file " + line2 + " in the Description section.\n") - continue - - try: - print(BlackPyramid_description_parser(soup)) - except: - traceback.print_exc() - print("There was a problem to parse the file " + line2 + " in the Description section!") - ''' - # reading listing pages - count = 0 - for fileListing in glob.glob(os.path.join("..\\" + marketPlace + "\\HTML_Pages\\10222023\\Listing", '*.html')): - count += 1 - lines.append(fileListing) - # if count > 1: - # break - - for index, line1 in enumerate(lines): - - print("Reading listing folder of '" + marketPlace + "', file '" + os.path.basename(line1) + "', index= " + str( - index + 1) + " ... " + str(len(lines))) - - readError = False - try: - html = codecs.open(line1.strip('\n'), encoding='utf8') - soup = BeautifulSoup(html, "html.parser") - html.close() - except: - try: - html = open(line1.strip('\n')) - soup = BeautifulSoup(html, "html.parser") - html.close() - except: - print("There was a problem to read the file " + line1 + " in the Listing section!") - readError = True - - if not readError: - - parseError = False - try: - print(BlackPyramid_listing_parser(soup)) - except: - traceback.print_exc() - print("There was a problem to parse the file " + line1 + " in the listing section!") - parseError = True - - print("DONE") \ No newline at end of file