From 9fca859758f2621e6051cb85ed751427a23c6c67 Mon Sep 17 00:00:00 2001
From: ericssonmarin-cpp <85146518+ericssonmarin-cpp@users.noreply.github.com>
Date: Sun, 15 Oct 2023 14:00:28 -0700
Subject: [PATCH 01/11] Forums and Markets status.
---
.idea/DW_Pipeline_Test.iml | 2 +-
.idea/misc.xml | 2 +-
Forums/DB_Connection/db_connection.py | 23 +++++++++++++++++++
Forums/Initialization/forumsList.txt | 9 +-------
Forums/Initialization/prepare_parser.py | 6 +++++
MarketPlaces/DB_Connection/db_connection.py | 23 +++++++++++++++++++
MarketPlaces/Initialization/marketsList.txt | 2 +-
MarketPlaces/Initialization/prepare_parser.py | 6 +++++
setup.ini | 14 ++++++-----
9 files changed, 70 insertions(+), 17 deletions(-)
diff --git a/.idea/DW_Pipeline_Test.iml b/.idea/DW_Pipeline_Test.iml
index 8489f64..9f9af70 100644
--- a/.idea/DW_Pipeline_Test.iml
+++ b/.idea/DW_Pipeline_Test.iml
@@ -2,7 +2,7 @@
-
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 11f1ea0..653c6ff 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,4 +1,4 @@
-
+
\ No newline at end of file
diff --git a/Forums/DB_Connection/db_connection.py b/Forums/DB_Connection/db_connection.py
index e4f6c5d..f0d4ed6 100644
--- a/Forums/DB_Connection/db_connection.py
+++ b/Forums/DB_Connection/db_connection.py
@@ -484,6 +484,24 @@ def create_posts(cur, row, forumId, topicId):
'dateinserted_post': row[8],
'postId': postId})
+def create_status(cur, forumId, date, status):
+
+ date = datetime.strptime(date, "%m%d%Y")
+
+ # checking if status already exists
+ sql = "select * from forums_status where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s"
+ cur.execute(sql, {'forum_id': forumId, 'date_inserted': date})
+
+ recset = cur.fetchall()
+ if recset:
+ sql = "Update forums_status set status = %(status)s where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s"
+ recset = {'status': status, 'forum_id': forumId, 'date_inserted': date}
+ else:
+ sql = "Insert into forums_status (forum_id, date_inserted, status) Values (%s, %s, %s)"
+ recset = [forumId, date, status]
+
+ cur.execute(sql, recset)
+
def create_database(cur, con):
try:
@@ -496,6 +514,11 @@ def create_database(cur, con):
sql = "create unique index unique_forum ON forums USING btree (name_forum ASC NULLS LAST)"
cur.execute(sql)
+ sql = "Create table forums_status (forum_id integer NOT NULL, date_inserted date NOT NULL, status bit(1) NOT NULL, " \
+ "CONSTRAINT forums_log_pkey PRIMARY KEY (forum_id, date_inserted), " \
+ "CONSTRAINT forums_fk FOREIGN KEY (forum_id) REFERENCES forums (forum_id))"
+ cur.execute(sql)
+
sql = "create table users (user_id integer NOT NULL, forum_id integer NOT NULL, name_user character varying(" \
"255) NOT NULL, status_user character varying(255) null, reputation_user character varying(255) null, " \
"interest_user character varying(5000) null, signature_user character varying(1000) null, " \
diff --git a/Forums/Initialization/forumsList.txt b/Forums/Initialization/forumsList.txt
index 9cfeb56..efa9686 100644
--- a/Forums/Initialization/forumsList.txt
+++ b/Forums/Initialization/forumsList.txt
@@ -1,8 +1 @@
-Altenens
-BestCardingWorld
-Cardingleaks
-CryptBB
-HiddenAnswers
-Libre
-OnniForums
-Procrax
\ No newline at end of file
+BestCardingWorld
\ No newline at end of file
diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py
index 91b662f..1f55319 100644
--- a/Forums/Initialization/prepare_parser.py
+++ b/Forums/Initialization/prepare_parser.py
@@ -341,6 +341,12 @@ def new_parse(forum, url, createLog):
# move listing files of completed folder
move_file(listingFile, createLog, logFile)
+ # registering the current forum status (up/down) in the database
+ forumId = verifyForum(cur, forum)
+ if (forumId > 0):
+ create_status(cur, forumId, CURRENT_DATE, '1' if len(listings) > 0 else '0')
+ con.commit()
+
if createLog:
logFile.close()
diff --git a/MarketPlaces/DB_Connection/db_connection.py b/MarketPlaces/DB_Connection/db_connection.py
index 8769869..4f439f0 100644
--- a/MarketPlaces/DB_Connection/db_connection.py
+++ b/MarketPlaces/DB_Connection/db_connection.py
@@ -401,6 +401,24 @@ def create_items(cur, row, marketId, vendorId):
return itemId
+def create_status(cur, marketId, date, status):
+
+ date = datetime.strptime(date, "%m%d%Y")
+
+ # checking if status already exists
+ sql = "select * from marketplaces_status where market_id = %(market_id)s and date_inserted = %(date_inserted)s"
+ cur.execute(sql, {'market_id': marketId, 'date_inserted': date})
+
+ recset = cur.fetchall()
+ if recset:
+ sql = "Update marketplaces_status set status = %(status)s where market_id = %(market_id)s and date_inserted = %(date_inserted)s"
+ recset = {'status': status, 'market_id': marketId, 'date_inserted': date}
+ else:
+ sql = "Insert into marketplaces_status (market_id, date_inserted, status) Values (%s, %s, %s)"
+ recset = [marketId, date, status]
+
+ cur.execute(sql, recset)
+
def create_database(cur, con):
try:
@@ -413,6 +431,11 @@ def create_database(cur, con):
sql = "create unique index unique_market ON marketplaces USING btree (name_market ASC NULLS LAST)"
cur.execute(sql)
+ sql = "Create table marketplaces_status (market_id integer NOT NULL, date_inserted date NOT NULL, status bit(1) NOT NULL, " \
+ "CONSTRAINT marketplaces_log_pkey PRIMARY KEY (market_id, date_inserted), " \
+ "CONSTRAINT marketplaces_fk FOREIGN KEY (market_id) REFERENCES marketplaces (market_id))"
+ cur.execute(sql)
+
sql = "create table vendors(vendor_id integer not null, market_id integer not null, name_vendor character " \
"varying(255) not null, rating_vendor character varying(255), successfultransactions_vendor integer " \
"null, image_vendor character varying(10000000) null, dateinserted_vendor timestamp(6) with time zone not null, " \
diff --git a/MarketPlaces/Initialization/marketsList.txt b/MarketPlaces/Initialization/marketsList.txt
index 87f811c..8b944c5 100644
--- a/MarketPlaces/Initialization/marketsList.txt
+++ b/MarketPlaces/Initialization/marketsList.txt
@@ -1 +1 @@
-ThiefWorld
\ No newline at end of file
+ViceCity
\ No newline at end of file
diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py
index e075541..c56054e 100644
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -363,6 +363,12 @@ def new_parse(marketPlace, url, createLog):
# move listing files of completed folder
move_file(listingFile, createLog, logFile)
+ # registering the current forum status (up/down) in the database
+ marketId = verifyMarketPlace(cur, marketPlace)
+ if (marketId > 0):
+ create_status(cur, marketId, CURRENT_DATE, '1' if len(listings) > 0 else '0')
+ con.commit()
+
if createLog:
logFile.close()
diff --git a/setup.ini b/setup.ini
index 29997a6..883e495 100644
--- a/setup.ini
+++ b/setup.ini
@@ -1,17 +1,19 @@
[TOR]
-firefox_binary_path = C:\Users\calsyslab\Desktop\Tor Browser\Browser\firefox.exe
-firefox_profile_path = C:\Users\calsyslab\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default
-geckodriver_path = C:\calsyslab\Project\dw_pipeline_test\selenium\geckodriver.exe
+firefox_binary_path = C:\Users\santanamarin\OneDrive - Cal Poly Pomona\Desktop\Tor Browser\Browser\firefox.exe
+firefox_profile_path = C:\Users\santanamarin\OneDrive - Cal Poly Pomona\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default
+geckodriver_path = E:\Faculty\CSPUP\ResearchProjects\dw_pipeline_test\selenium\geckodriver.exe
+
+"C:\Users\santanamarin\OneDrive - Cal Poly Pomona\Desktop\Tor Browser\Browser\firefox.exe"
[Project]
-project_directory = C:\calsyslab\Project\dw_pipeline_test
-shared_folder = \\VBoxSvr\Shared
+project_directory = E:\Faculty\CSPUP\ResearchProjects\dw_pipeline_test
+shared_folder = E:\Faculty\CSPUP\ResearchProjects\dw_pipeline_test
[PostgreSQL]
ip = localhost
username = postgres
-password = password
+password = 123
database = darkweb_markets_forums
[Encryption]
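For reference, a minimal sketch of how the new create_status upsert is meant to be driven: the first call for a given forum and date inserts a forums_status row, and a later call for the same pair updates it. The connection settings, forum id, and date below are illustrative assumptions, not part of the patch.

    # sketch only: assumes a local PostgreSQL instance configured as in setup.ini
    import psycopg2
    from Forums.DB_Connection.db_connection import create_status

    con = psycopg2.connect(host='localhost', user='postgres',
                           password='...', dbname='darkweb_markets_forums')
    cur = con.cursor()

    # forum id 1 and the %m%d%Y-formatted date are placeholders
    create_status(cur, 1, '10152023', '1')   # no row yet for this forum/date -> INSERT
    create_status(cur, 1, '10152023', '0')   # same forum/date again -> UPDATE of status
    con.commit()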
From 44561c509fb7ed1a1923ab5b25a328117d2228d1 Mon Sep 17 00:00:00 2001
From: Ericsson Santana Marin
Date: Sun, 15 Oct 2023 22:12:17 +0000
Subject: [PATCH 02/11] Revert "Forums and Markets status."
This reverts commit 9fca859758f2621e6051cb85ed751427a23c6c67
---
.idea/DW_Pipeline_Test.iml | 2 +-
.idea/misc.xml | 2 +-
Forums/DB_Connection/db_connection.py | 23 -------------------
Forums/Initialization/forumsList.txt | 9 +++++++-
Forums/Initialization/prepare_parser.py | 6 -----
MarketPlaces/DB_Connection/db_connection.py | 23 -------------------
MarketPlaces/Initialization/marketsList.txt | 2 +-
MarketPlaces/Initialization/prepare_parser.py | 6 -----
setup.ini | 14 +++++------
9 files changed, 17 insertions(+), 70 deletions(-)
diff --git a/.idea/DW_Pipeline_Test.iml b/.idea/DW_Pipeline_Test.iml
index 9f9af70..8489f64 100644
--- a/.idea/DW_Pipeline_Test.iml
+++ b/.idea/DW_Pipeline_Test.iml
@@ -2,7 +2,7 @@
-
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 653c6ff..11f1ea0 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,4 +1,4 @@
-
+
\ No newline at end of file
diff --git a/Forums/DB_Connection/db_connection.py b/Forums/DB_Connection/db_connection.py
index f0d4ed6..e4f6c5d 100644
--- a/Forums/DB_Connection/db_connection.py
+++ b/Forums/DB_Connection/db_connection.py
@@ -484,24 +484,6 @@ def create_posts(cur, row, forumId, topicId):
'dateinserted_post': row[8],
'postId': postId})
-def create_status(cur, forumId, date, status):
-
- date = datetime.strptime(date, "%m%d%Y")
-
- # checking if status already exists
- sql = "select * from forums_status where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s"
- cur.execute(sql, {'forum_id': forumId, 'date_inserted': date})
-
- recset = cur.fetchall()
- if recset:
- sql = "Update forums_status set status = %(status)s where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s"
- recset = {'status': status, 'forum_id': forumId, 'date_inserted': date}
- else:
- sql = "Insert into forums_status (forum_id, date_inserted, status) Values (%s, %s, %s)"
- recset = [forumId, date, status]
-
- cur.execute(sql, recset)
-
def create_database(cur, con):
try:
@@ -514,11 +496,6 @@ def create_database(cur, con):
sql = "create unique index unique_forum ON forums USING btree (name_forum ASC NULLS LAST)"
cur.execute(sql)
- sql = "Create table forums_status (forum_id integer NOT NULL, date_inserted date NOT NULL, status bit(1) NOT NULL, " \
- "CONSTRAINT forums_log_pkey PRIMARY KEY (forum_id, date_inserted), " \
- "CONSTRAINT forums_fk FOREIGN KEY (forum_id) REFERENCES forums (forum_id))"
- cur.execute(sql)
-
sql = "create table users (user_id integer NOT NULL, forum_id integer NOT NULL, name_user character varying(" \
"255) NOT NULL, status_user character varying(255) null, reputation_user character varying(255) null, " \
"interest_user character varying(5000) null, signature_user character varying(1000) null, " \
diff --git a/Forums/Initialization/forumsList.txt b/Forums/Initialization/forumsList.txt
index efa9686..9cfeb56 100644
--- a/Forums/Initialization/forumsList.txt
+++ b/Forums/Initialization/forumsList.txt
@@ -1 +1,8 @@
-BestCardingWorld
\ No newline at end of file
+Altenens
+BestCardingWorld
+Cardingleaks
+CryptBB
+HiddenAnswers
+Libre
+OnniForums
+Procrax
\ No newline at end of file
diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py
index 1f55319..91b662f 100644
--- a/Forums/Initialization/prepare_parser.py
+++ b/Forums/Initialization/prepare_parser.py
@@ -341,12 +341,6 @@ def new_parse(forum, url, createLog):
# move listing files of completed folder
move_file(listingFile, createLog, logFile)
- # registering the current forum status (up/down) in the database
- forumId = verifyForum(cur, forum)
- if (forumId > 0):
- create_status(cur, forumId, CURRENT_DATE, '1' if len(listings) > 0 else '0')
- con.commit()
-
if createLog:
logFile.close()
diff --git a/MarketPlaces/DB_Connection/db_connection.py b/MarketPlaces/DB_Connection/db_connection.py
index 4f439f0..8769869 100644
--- a/MarketPlaces/DB_Connection/db_connection.py
+++ b/MarketPlaces/DB_Connection/db_connection.py
@@ -401,24 +401,6 @@ def create_items(cur, row, marketId, vendorId):
return itemId
-def create_status(cur, marketId, date, status):
-
- date = datetime.strptime(date, "%m%d%Y")
-
- # checking if status already exists
- sql = "select * from marketplaces_status where market_id = %(market_id)s and date_inserted = %(date_inserted)s"
- cur.execute(sql, {'market_id': marketId, 'date_inserted': date})
-
- recset = cur.fetchall()
- if recset:
- sql = "Update marketplaces_status set status = %(status)s where market_id = %(market_id)s and date_inserted = %(date_inserted)s"
- recset = {'status': status, 'market_id': marketId, 'date_inserted': date}
- else:
- sql = "Insert into marketplaces_status (market_id, date_inserted, status) Values (%s, %s, %s)"
- recset = [marketId, date, status]
-
- cur.execute(sql, recset)
-
def create_database(cur, con):
try:
@@ -431,11 +413,6 @@ def create_database(cur, con):
sql = "create unique index unique_market ON marketplaces USING btree (name_market ASC NULLS LAST)"
cur.execute(sql)
- sql = "Create table marketplaces_status (market_id integer NOT NULL, date_inserted date NOT NULL, status bit(1) NOT NULL, " \
- "CONSTRAINT marketplaces_log_pkey PRIMARY KEY (market_id, date_inserted), " \
- "CONSTRAINT marketplaces_fk FOREIGN KEY (market_id) REFERENCES marketplaces (market_id))"
- cur.execute(sql)
-
sql = "create table vendors(vendor_id integer not null, market_id integer not null, name_vendor character " \
"varying(255) not null, rating_vendor character varying(255), successfultransactions_vendor integer " \
"null, image_vendor character varying(10000000) null, dateinserted_vendor timestamp(6) with time zone not null, " \
diff --git a/MarketPlaces/Initialization/marketsList.txt b/MarketPlaces/Initialization/marketsList.txt
index 8b944c5..87f811c 100644
--- a/MarketPlaces/Initialization/marketsList.txt
+++ b/MarketPlaces/Initialization/marketsList.txt
@@ -1 +1 @@
-ViceCity
\ No newline at end of file
+ThiefWorld
\ No newline at end of file
diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py
index c56054e..e075541 100644
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -363,12 +363,6 @@ def new_parse(marketPlace, url, createLog):
# move listing files of completed folder
move_file(listingFile, createLog, logFile)
- # registering the current forum status (up/down) in the database
- marketId = verifyMarketPlace(cur, marketPlace)
- if (marketId > 0):
- create_status(cur, marketId, CURRENT_DATE, '1' if len(listings) > 0 else '0')
- con.commit()
-
if createLog:
logFile.close()
diff --git a/setup.ini b/setup.ini
index 883e495..29997a6 100644
--- a/setup.ini
+++ b/setup.ini
@@ -1,19 +1,17 @@
[TOR]
-firefox_binary_path = C:\Users\santanamarin\OneDrive - Cal Poly Pomona\Desktop\Tor Browser\Browser\firefox.exe
-firefox_profile_path = C:\Users\santanamarin\OneDrive - Cal Poly Pomona\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default
-geckodriver_path = E:\Faculty\CSPUP\ResearchProjects\dw_pipeline_test\selenium\geckodriver.exe
-
-"C:\Users\santanamarin\OneDrive - Cal Poly Pomona\Desktop\Tor Browser\Browser\firefox.exe"
+firefox_binary_path = C:\Users\calsyslab\Desktop\Tor Browser\Browser\firefox.exe
+firefox_profile_path = C:\Users\calsyslab\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default
+geckodriver_path = C:\calsyslab\Project\dw_pipeline_test\selenium\geckodriver.exe
[Project]
-project_directory = E:\Faculty\CSPUP\ResearchProjects\dw_pipeline_test
-shared_folder = E:\Faculty\CSPUP\ResearchProjects\dw_pipeline_test
+project_directory = C:\calsyslab\Project\dw_pipeline_test
+shared_folder = \\VBoxSvr\Shared
[PostgreSQL]
ip = localhost
username = postgres
-password = 123
+password = password
database = darkweb_markets_forums
[Encryption]
From ed5a9193e19b1ef6fef6dd714920306b20c6dfd6 Mon Sep 17 00:00:00 2001
From: ericssonmarin-cpp <85146518+ericssonmarin-cpp@users.noreply.github.com>
Date: Sun, 15 Oct 2023 15:20:02 -0700
Subject: [PATCH 03/11] Forums and Markets status.
---
Forums/DB_Connection/db_connection.py | 23 +++++++++++++++++++
Forums/Initialization/prepare_parser.py | 6 +++++
MarketPlaces/DB_Connection/db_connection.py | 23 +++++++++++++++++++
MarketPlaces/Initialization/prepare_parser.py | 6 +++++
4 files changed, 58 insertions(+)
diff --git a/Forums/DB_Connection/db_connection.py b/Forums/DB_Connection/db_connection.py
index e4f6c5d..f0d4ed6 100644
--- a/Forums/DB_Connection/db_connection.py
+++ b/Forums/DB_Connection/db_connection.py
@@ -484,6 +484,24 @@ def create_posts(cur, row, forumId, topicId):
'dateinserted_post': row[8],
'postId': postId})
+def create_status(cur, forumId, date, status):
+
+ date = datetime.strptime(date, "%m%d%Y")
+
+ # checking if status already exists
+ sql = "select * from forums_status where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s"
+ cur.execute(sql, {'forum_id': forumId, 'date_inserted': date})
+
+ recset = cur.fetchall()
+ if recset:
+ sql = "Update forums_status set status = %(status)s where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s"
+ recset = {'status': status, 'forum_id': forumId, 'date_inserted': date}
+ else:
+ sql = "Insert into forums_status (forum_id, date_inserted, status) Values (%s, %s, %s)"
+ recset = [forumId, date, status]
+
+ cur.execute(sql, recset)
+
def create_database(cur, con):
try:
@@ -496,6 +514,11 @@ def create_database(cur, con):
sql = "create unique index unique_forum ON forums USING btree (name_forum ASC NULLS LAST)"
cur.execute(sql)
+ sql = "Create table forums_status (forum_id integer NOT NULL, date_inserted date NOT NULL, status bit(1) NOT NULL, " \
+ "CONSTRAINT forums_log_pkey PRIMARY KEY (forum_id, date_inserted), " \
+ "CONSTRAINT forums_fk FOREIGN KEY (forum_id) REFERENCES forums (forum_id))"
+ cur.execute(sql)
+
sql = "create table users (user_id integer NOT NULL, forum_id integer NOT NULL, name_user character varying(" \
"255) NOT NULL, status_user character varying(255) null, reputation_user character varying(255) null, " \
"interest_user character varying(5000) null, signature_user character varying(1000) null, " \
diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py
index 91b662f..1f55319 100644
--- a/Forums/Initialization/prepare_parser.py
+++ b/Forums/Initialization/prepare_parser.py
@@ -341,6 +341,12 @@ def new_parse(forum, url, createLog):
# move listing files of completed folder
move_file(listingFile, createLog, logFile)
+ # registering the current forum status (up/down) in the database
+ forumId = verifyForum(cur, forum)
+ if (forumId > 0):
+ create_status(cur, forumId, CURRENT_DATE, '1' if len(listings) > 0 else '0')
+ con.commit()
+
if createLog:
logFile.close()
diff --git a/MarketPlaces/DB_Connection/db_connection.py b/MarketPlaces/DB_Connection/db_connection.py
index 8769869..4f439f0 100644
--- a/MarketPlaces/DB_Connection/db_connection.py
+++ b/MarketPlaces/DB_Connection/db_connection.py
@@ -401,6 +401,24 @@ def create_items(cur, row, marketId, vendorId):
return itemId
+def create_status(cur, marketId, date, status):
+
+ date = datetime.strptime(date, "%m%d%Y")
+
+ # checking if status already exists
+ sql = "select * from marketplaces_status where market_id = %(market_id)s and date_inserted = %(date_inserted)s"
+ cur.execute(sql, {'market_id': marketId, 'date_inserted': date})
+
+ recset = cur.fetchall()
+ if recset:
+ sql = "Update marketplaces_status set status = %(status)s where market_id = %(market_id)s and date_inserted = %(date_inserted)s"
+ recset = {'status': status, 'market_id': marketId, 'date_inserted': date}
+ else:
+ sql = "Insert into marketplaces_status (market_id, date_inserted, status) Values (%s, %s, %s)"
+ recset = [marketId, date, status]
+
+ cur.execute(sql, recset)
+
def create_database(cur, con):
try:
@@ -413,6 +431,11 @@ def create_database(cur, con):
sql = "create unique index unique_market ON marketplaces USING btree (name_market ASC NULLS LAST)"
cur.execute(sql)
+ sql = "Create table marketplaces_status (market_id integer NOT NULL, date_inserted date NOT NULL, status bit(1) NOT NULL, " \
+ "CONSTRAINT marketplaces_log_pkey PRIMARY KEY (market_id, date_inserted), " \
+ "CONSTRAINT marketplaces_fk FOREIGN KEY (market_id) REFERENCES marketplaces (market_id))"
+ cur.execute(sql)
+
sql = "create table vendors(vendor_id integer not null, market_id integer not null, name_vendor character " \
"varying(255) not null, rating_vendor character varying(255), successfultransactions_vendor integer " \
"null, image_vendor character varying(10000000) null, dateinserted_vendor timestamp(6) with time zone not null, " \
diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py
index e075541..c56054e 100644
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -363,6 +363,12 @@ def new_parse(marketPlace, url, createLog):
# move listing files of completed folder
move_file(listingFile, createLog, logFile)
+ # registering the current forum status (up/down) in the database
+ marketId = verifyMarketPlace(cur, marketPlace)
+ if (marketId > 0):
+ create_status(cur, marketId, CURRENT_DATE, '1' if len(listings) > 0 else '0')
+ con.commit()
+
if createLog:
logFile.close()
From 61c85e05473cd81bb5ae181957cf6f899cdd04e9 Mon Sep 17 00:00:00 2001
From: ericssonmarin-cpp <85146518+ericssonmarin-cpp@users.noreply.github.com>
Date: Sun, 15 Oct 2023 15:38:24 -0700
Subject: [PATCH 04/11] Forums and Markets status.
---
MarketPlaces/Initialization/prepare_parser.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py
index c56054e..f3c792a 100644
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -363,7 +363,7 @@ def new_parse(marketPlace, url, createLog):
# move listing files of completed folder
move_file(listingFile, createLog, logFile)
- # registering the current forum status (up/down) in the database
+ # registering the current market status (up/down) in the database
marketId = verifyMarketPlace(cur, marketPlace)
if (marketId > 0):
create_status(cur, marketId, CURRENT_DATE, '1' if len(listings) > 0 else '0')
From 07bfc887b18581f7ce02dab9befb2e10abc72899 Mon Sep 17 00:00:00 2001
From: westernmeadow
Date: Wed, 25 Oct 2023 16:30:57 -0700
Subject: [PATCH 05/11] don't use cleanLink
---
MarketPlaces/DarkBazar/crawler_selenium.py | 12 ++++++------
MarketPlaces/DarkBazar/parser.py | 1 -
2 files changed, 6 insertions(+), 7 deletions(-)
diff --git a/MarketPlaces/DarkBazar/crawler_selenium.py b/MarketPlaces/DarkBazar/crawler_selenium.py
index fdfb640..d351c42 100644
--- a/MarketPlaces/DarkBazar/crawler_selenium.py
+++ b/MarketPlaces/DarkBazar/crawler_selenium.py
@@ -216,12 +216,12 @@ def crawlForum(driver):
savePage(driver, driver.page_source, item)
driver.back()
- # # comment out
- # break
- #
- # # comment out
- # if count == 1:
- # break
+ # comment out
+ break
+
+ # comment out
+ if count == 1:
+ break
try:
link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
diff --git a/MarketPlaces/DarkBazar/parser.py b/MarketPlaces/DarkBazar/parser.py
index 9386d18..3d56e92 100644
--- a/MarketPlaces/DarkBazar/parser.py
+++ b/MarketPlaces/DarkBazar/parser.py
@@ -170,7 +170,6 @@ def darkbazar_listing_parser(soup):
# Adding the url to the list of urls
link = bae[0].get('href')
- link = cleanLink(link)
href.append(link)
# Finding the Product
From f0003d4b386ecf7188ee01bb53a40bcd1264ed06 Mon Sep 17 00:00:00 2001
From: Helium
Date: Thu, 26 Oct 2023 13:14:25 -0700
Subject: [PATCH 06/11] Kingdom completed for initial testing; may need to
 create a new account occasionally since the original account was deleted
---
MarketPlaces/Initialization/prepare_parser.py | 6 +
MarketPlaces/Kingdom/crawler_selenium.py | 121 ++++--------------
MarketPlaces/Kingdom/parser.py | 21 ++-
3 files changed, 50 insertions(+), 98 deletions(-)
diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py
index f3c792a..982995f 100644
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -15,6 +15,8 @@ from MarketPlaces.M00nkeyMarket.parser import *
from MarketPlaces.MikesGrandStore.parser import *
from MarketPlaces.PabloEscobarMarket.parser import *
from MarketPlaces.CityMarket.parser import *
+from MarketPlaces.Kingdom.parser import *
+
from MarketPlaces.Classifier.classify_product import predict
@@ -130,6 +132,8 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
rw = pabloescobarmarket_listing_parser(soup)
elif marketPlace == "CityMarket":
rw = city_listing_parser(soup)
+ elif marketPlace == "Kingdom":
+ rw = kingdom_listing_parser(soup)
else:
print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
raise Exception
@@ -164,6 +168,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
rmm = pabloescobarmarket_description_parser(soup)
elif marketPlace == "CityMarket":
rmm = city_description_parser(soup)
+ elif marketPlace == "Kingdom":
+ rmm = kingdom_description_parser(soup)
else:
print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
raise Exception
diff --git a/MarketPlaces/Kingdom/crawler_selenium.py b/MarketPlaces/Kingdom/crawler_selenium.py
index e6b489f..5385150 100644
--- a/MarketPlaces/Kingdom/crawler_selenium.py
+++ b/MarketPlaces/Kingdom/crawler_selenium.py
@@ -1,4 +1,4 @@
-__author__ = 'DarkWeb'
+__author__ = 'Helium'
'''
Kingdom Market Crawler (Selenium)
@@ -35,55 +35,27 @@ baseURL = 'http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion
# Opens Tor Browser, crawls the website
def startCrawling():
- # marketName = getMarketName()
+ mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
- captcha(driver)
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
- # new_parse(marketName, False)
+ new_parse(mktName, baseURL, True)
+# Login using premade account credentials and do login captcha manually
+def login(driver):
-def captcha(driver):
- '''
- # wait for captcha page
- WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.XPATH, "/html/body/div/div[1]")))
-
- # save captcha to local
- driver.find_element(by=By.XPATH, value='/html/body/div/div[2]').screenshot(
- r'..\Kingdom\captcha1.png')
-
- # This method will show image in any image viewer
- im = Image.open(r'..\Kingdom\captcha1.png')
- im.show()
-
- iframes = driver.find_elements(by=By.TAG_NAME, value='iframe')
-
- # ask user input captcha solution in terminal
- print("Enter squares from smallest to largest (squares are numbered 1-9 left to right)")
- for order in ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']:
- id = input(f"{order}: ")
- iframes[int(id)-1].click()
- '''
input("Press ENTER when CAPTCHA is completed\n")
# wait for login page
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.XPATH, "/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button")))
-
-
-# Login using premade account credentials and do login captcha manually
-def login(driver):
- # wait for login page
- WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.XPATH, "/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button")))
+ (By.XPATH, '//*[@id="login-form"]')))
# entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-name"]')
@@ -96,39 +68,17 @@ def login(driver):
select = Select(driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-sessiontime"]'))
select.select_by_visible_text('24 hours')
- '''
- # wait for captcha page show up
- WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.XPATH, '//*[@id="captcha"]')))
-
- # save captcha to local
- driver.find_element(by=By.XPATH, value='//*[@id="captcha"]').screenshot(r'..\Kingdom\captcha2.png')
-
- # This method will show image in any image viewer
- im = Image.open(r'..\Kingdom\captcha2.png')
- im.show()
-
- # wait until input space show up
- inputBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-captcha"]')
-
- # ask user input captcha solution in terminal
- userIn = input("Enter solution: ")
-
- # send user solution into the input space
- inputBox.send_keys(userIn)
-
- # click the verify(submit) button
- driver.find_element(by=By.XPATH, value="/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button").click()
- '''
- input("Press ENTER when CAPTCHA is completed\n")
+ input("Press ENTER when CAPTCHA and DDOS is completed\n")
# wait for listing page show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
- (By.XPATH, '/html/body/div/div/div[3]/div[2]')))
+ (By.XPATH, '/html/body/div/div/div[3]/div[1]/div/div[3]')))
+
+
# Returns the name of the website
-def getMarketName():
+def getMKTName():
name = 'Kingdom'
return name
@@ -236,30 +186,17 @@ def getInterestedLinks():
links = []
# Software and Malware
- links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=127&t=c298a77d9e93ad32')
+ links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=127&t=597a56b9a0b3e0d0')
# # Services
- # links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=45&t=c298a77d9e93ad32')
- # # Exploits
- # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=45')
- # # Tools
- # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=46')
- # # Malware
- # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=47')
- # # Cryptography
- # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=48')
- # # Others
- # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=49')
- # # Hacking Tutorials
- # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=50')
- # # Hacked Accounts and Database Dumps
- # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=30')
- # # Android Moded pak
- # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=53')
+ links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=45&t=597a56b9a0b3e0d0')
+ # # guides and tutorials
+ links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=107&t=597a56b9a0b3e0d0')
return links
def crawlForum(driver):
+
print("Crawling the Kingdom market")
linksToCrawl = getInterestedLinks()
@@ -281,6 +218,7 @@ def crawlForum(driver):
savePage(driver, html, link)
list = productPages(html)
+
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
@@ -290,18 +228,15 @@ def crawlForum(driver):
savePage(driver, driver.page_source, item)
driver.back()
- # comment out
- break
-
- # comment out
- if count == 1:
- break
+ # # comment out
+ # break
+ #
+ # # comment out
+ # if count == 1:
+ # break
try:
- temp = driver.find_element(by=By.XPATH, value=
- '/html/body/div/div/div[3]/div[2]/div[2]/div/div/ul')
- next = temp.find_element_by_class_name("next")
- link = link.find_element_by_tag_name('a').get_attribute('href')
+ link = driver.find_element(by=By.XPATH, value='/html/body/div/div/div[3]/div[2]/div[2]/div[3]/div/ul/li[13]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
@@ -313,7 +248,7 @@ def crawlForum(driver):
print(link, e)
i += 1
- input("Crawling Kingdom Market done sucessfully. Press ENTER to continue\n")
+ print("Crawling the Kingdom market done.")
# Returns 'True' if the link is Topic link
@@ -325,7 +260,7 @@ def isDescriptionLink(url):
# Returns True if the link is a listingPage link
def isListingLink(url):
- if 'category' in url:
+ if 'filter_category' in url:
return True
return False
@@ -333,10 +268,8 @@ def isListingLink(url):
# calling the parser to define the links
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
- #print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text)
return kingdom_links_parser(soup)
def crawler():
- startCrawling()
- # print("Crawling and Parsing BestCardingWorld .... DONE!")
+ startCrawling()
\ No newline at end of file
diff --git a/MarketPlaces/Kingdom/parser.py b/MarketPlaces/Kingdom/parser.py
index b1e05d5..abade27 100644
--- a/MarketPlaces/Kingdom/parser.py
+++ b/MarketPlaces/Kingdom/parser.py
@@ -1,4 +1,4 @@
-__author__ = 'DarkWeb'
+__author__ = 'Helium'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
@@ -31,6 +31,8 @@ def kingdom_description_parser(soup):
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
+ image = "-1" # 19 Product_Image
+ vendor_image = "-1" # 20 Vendor_Image
# Finding Product Name
@@ -95,7 +97,7 @@ def kingdom_description_parser(soup):
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
- BTC, USD, EURO, sold, left, shipFrom, shipTo)
+ BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
# Sending the results
@@ -126,7 +128,9 @@ def kingdom_listing_parser(soup):
qLeft =[] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
- href = [] # 20 Product_Links
+ image = [] # 20 Product_Image
+ image_vendor = [] # 21 Vendor_Image
+ href = [] # 22 Product_Links
listing = soup.find('div', {"id": "p0"}).find('div').find_all('div', {"class": "row"}, recursive=False)
@@ -153,12 +157,20 @@ def kingdom_listing_parser(soup):
product = product.strip()
name.append(product)
+ # Finding Product Image
+ product_image = a.find('img')
+ product_image = product_image.get('src')
+ product_image = product_image.split('base64,')[-1]
+ image.append(product_image)
+
# Finding the Vendor
vendor_name = a.select_one('a[href^="/user"]').text
vendor_name = vendor_name.replace(",", " ").replace('/', '')
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
+ image_vendor.append("-1")
+
# Adding the url to the list of urls
link = a.find('div', {"class": "col-md-7"}).select_one('a[href^="/offer/view?"]')['href']
link = cleanLink(link)
@@ -169,7 +181,8 @@ def kingdom_listing_parser(soup):
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
- reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
+ reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href,
+ image, image_vendor)
def kingdom_links_parser(soup):
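A small, self-contained sketch of the inline-image handling added to kingdom_listing_parser above: Kingdom listing thumbnails arrive as data URIs, so only the base64 payload after 'base64,' is kept. The HTML snippet below is a made-up example.

    from bs4 import BeautifulSoup

    html = '<div class="row"><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUg=="></div>'
    soup = BeautifulSoup(html, "html.parser")

    product_image = soup.find('img').get('src')
    product_image = product_image.split('base64,')[-1]  # keep only the base64 payload
    print(product_image)  # iVBORw0KGgoAAAANSUhEUg==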
From 1d091b944acb9f2f8e01b2a377cedab5201c9076 Mon Sep 17 00:00:00 2001
From: westernmeadow
Date: Fri, 27 Oct 2023 14:34:07 -0700
Subject: [PATCH 07/11] removed indent
---
MarketPlaces/DarkBazar/crawler_selenium.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/MarketPlaces/DarkBazar/crawler_selenium.py b/MarketPlaces/DarkBazar/crawler_selenium.py
index d351c42..dac91b0 100644
--- a/MarketPlaces/DarkBazar/crawler_selenium.py
+++ b/MarketPlaces/DarkBazar/crawler_selenium.py
@@ -236,7 +236,7 @@ def crawlForum(driver):
print(link, e)
i += 1
- print("Crawling the DarkBazar market done.")
+ print("Crawling the DarkBazar market done.")
# Returns 'True' if the link is Topic link, may need to change for every website
From b084d76d3ed6277bbb6e030828be564503321d0c Mon Sep 17 00:00:00 2001
From: westernmeadow
Date: Fri, 27 Oct 2023 15:13:58 -0700
Subject: [PATCH 08/11] listings, descriptions, and reference date
---
Forums/DB_Connection/db_connection.py | 19 +++++++++------
Forums/Initialization/prepare_parser.py | 8 +++++--
MarketPlaces/DB_Connection/db_connection.py | 24 ++++++++++++-------
MarketPlaces/Initialization/prepare_parser.py | 8 +++++--
4 files changed, 39 insertions(+), 20 deletions(-)
diff --git a/Forums/DB_Connection/db_connection.py b/Forums/DB_Connection/db_connection.py
index f0d4ed6..dfdec49 100644
--- a/Forums/DB_Connection/db_connection.py
+++ b/Forums/DB_Connection/db_connection.py
@@ -3,7 +3,7 @@ __author__ = 'DarkWeb'
import psycopg2
import traceback
from Forums.Utilities.utilities import *
-
+from dateutil.relativedelta import relativedelta, FR
def connectDataBase():
@@ -484,21 +484,25 @@ def create_posts(cur, row, forumId, topicId):
'dateinserted_post': row[8],
'postId': postId})
-def create_status(cur, forumId, date, status):
+def create_status(cur, forumId, date, listings, descriptions, status):
date = datetime.strptime(date, "%m%d%Y")
+ # getting last Friday as a reference date
+ date_reference = date + relativedelta(weekday=FR(-1))
+
# checking if status already exists
sql = "select * from forums_status where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s"
cur.execute(sql, {'forum_id': forumId, 'date_inserted': date})
recset = cur.fetchall()
if recset:
- sql = "Update forums_status set status = %(status)s where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s"
- recset = {'status': status, 'forum_id': forumId, 'date_inserted': date}
+ sql = "Update forums_status set listings = %(listings)s, descriptions = %(descriptions)s, status = %(status)s, date_reference = %(date_reference)s " \
+ "where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s"
+ recset = {'listings': listings, 'descriptions': descriptions, 'status': status, 'date_reference': date_reference, 'forum_id': forumId, 'date_inserted': date}
else:
- sql = "Insert into forums_status (forum_id, date_inserted, status) Values (%s, %s, %s)"
- recset = [forumId, date, status]
+ sql = "Insert into forums_status (forum_id, date_inserted, listings, descriptions, status, date_reference) Values (%s, %s, %s, %s, %s, %s)"
+ recset = [forumId, date, listings, descriptions, status, date_reference]
cur.execute(sql, recset)
@@ -514,7 +518,8 @@ def create_database(cur, con):
sql = "create unique index unique_forum ON forums USING btree (name_forum ASC NULLS LAST)"
cur.execute(sql)
- sql = "Create table forums_status (forum_id integer NOT NULL, date_inserted date NOT NULL, status bit(1) NOT NULL, " \
+ sql = "Create table forums_status (forum_id integer NOT NULL, date_inserted date NOT NULL, " \
+ "listings integer NOT NULL, descriptions integer NOT NULL, status bit(1) NOT NULL, date_reference date NOT NULL " \
"CONSTRAINT forums_log_pkey PRIMARY KEY (forum_id, date_inserted), " \
"CONSTRAINT forums_fk FOREIGN KEY (forum_id) REFERENCES forums (forum_id))"
cur.execute(sql)
diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py
index 1f55319..31982fd 100644
--- a/Forums/Initialization/prepare_parser.py
+++ b/Forums/Initialization/prepare_parser.py
@@ -341,10 +341,14 @@ def new_parse(forum, url, createLog):
# move listing files of completed folder
move_file(listingFile, createLog, logFile)
- # registering the current forum status (up/down) in the database
+ # registering the current forum status (up/down) and the number of scraped pages in the database
forumId = verifyForum(cur, forum)
if (forumId > 0):
- create_status(cur, forumId, CURRENT_DATE, '1' if len(listings) > 0 else '0')
+
+ readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.htm'))
+ readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.htm'))
+
+ create_status(cur, forumId, CURRENT_DATE, len(readListings), len(readDescriptions), '1' if len(listings) > 0 else '0')
con.commit()
if createLog:
diff --git a/MarketPlaces/DB_Connection/db_connection.py b/MarketPlaces/DB_Connection/db_connection.py
index 4f439f0..2f3341a 100644
--- a/MarketPlaces/DB_Connection/db_connection.py
+++ b/MarketPlaces/DB_Connection/db_connection.py
@@ -4,7 +4,7 @@ import psycopg2
import traceback
import configparser
from MarketPlaces.Utilities.utilities import *
-
+from dateutil.relativedelta import relativedelta, FR
def connectDataBase():
@@ -273,6 +273,8 @@ def create_items(cur, row, marketId, vendorId):
if newItem:
+ # decode_decrypt_image_in_base64(row[20])
+
sql = "Insert into items (item_id, market_id, vendor_id, name_item, description_item, cve_item, ms_item, category_item, " \
"views_item, reviews_item, rating_item, dateadded_item, btc_item, usd_item, euro_item, quantitysold_item, " \
"quantityleft_item, shippedfrom_item, shippedto_item, lastseen_item, image_item, href_item, dateinserted_item, " \
@@ -312,7 +314,7 @@ def create_items(cur, row, marketId, vendorId):
recset = cur.fetchall()
- # decode_decrypt_image_in_base64(recset[0][20])
+ # decode_decrypt_image_in_base64(recset[0]['image_item'])
if (str(recset[0]['description_item']) != str(row[5] if row[5] != '-1' else None) or
str(recset[0]['cve_item']) != str(row[6] if row[6] != '-1' else None) or
@@ -401,24 +403,27 @@ def create_items(cur, row, marketId, vendorId):
return itemId
-def create_status(cur, marketId, date, status):
+def create_status(cur, marketId, date, listings, descriptions, status):
date = datetime.strptime(date, "%m%d%Y")
+ # getting last Friday as a reference date
+ date_reference = date + relativedelta(weekday=FR(-1))
+
# checking if status already exists
sql = "select * from marketplaces_status where market_id = %(market_id)s and date_inserted = %(date_inserted)s"
cur.execute(sql, {'market_id': marketId, 'date_inserted': date})
recset = cur.fetchall()
if recset:
- sql = "Update marketplaces_status set status = %(status)s where market_id = %(market_id)s and date_inserted = %(date_inserted)s"
- recset = {'status': status, 'market_id': marketId, 'date_inserted': date}
+ sql = "Update marketplaces_status set listings = %(listings)s, descriptions = %(descriptions)s, status = %(status)s, date_reference = %(date_reference)s " \
+ "where market_id = %(market_id)s and date_inserted = %(date_inserted)s"
+ recset = {'listings': listings, 'descriptions': descriptions, 'status': status, 'date_reference': date_reference, 'market_id': marketId, 'date_inserted': date}
else:
- sql = "Insert into marketplaces_status (market_id, date_inserted, status) Values (%s, %s, %s)"
- recset = [marketId, date, status]
+ sql = "Insert into marketplaces_status (market_id, date_inserted, listings, descriptions, status, date_reference) Values (%s, %s, %s, %s, %s, %s)"
+ recset = [marketId, date, listings, descriptions, status, date_reference]
cur.execute(sql, recset)
-
def create_database(cur, con):
try:
@@ -431,7 +436,8 @@ def create_database(cur, con):
sql = "create unique index unique_market ON marketplaces USING btree (name_market ASC NULLS LAST)"
cur.execute(sql)
- sql = "Create table marketplaces_status (market_id integer NOT NULL, date_inserted date NOT NULL, status bit(1) NOT NULL, " \
+ sql = "Create table marketplaces_status (market_id integer NOT NULL, date_inserted date NOT NULL, " \
+ "listings integer NOT NULL, descriptions integer NOT NULL, status bit(1) NOT NULL, date_reference date NOT NULL " \
"CONSTRAINT marketplaces_log_pkey PRIMARY KEY (market_id, date_inserted), " \
"CONSTRAINT marketplaces_fk FOREIGN KEY (market_id) REFERENCES marketplaces (market_id))"
cur.execute(sql)
diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py
index 982995f..7c35f5a 100644
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -369,10 +369,14 @@ def new_parse(marketPlace, url, createLog):
# move listing files of completed folder
move_file(listingFile, createLog, logFile)
- # registering the current market status (up/down) in the database
+ # registering the current market status (up/down) and the number of scraped pages in the database
marketId = verifyMarketPlace(cur, marketPlace)
if (marketId > 0):
- create_status(cur, marketId, CURRENT_DATE, '1' if len(listings) > 0 else '0')
+
+ readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.htm'))
+ readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.htm'))
+
+ create_status(cur, marketId, CURRENT_DATE, len(readListings), len(readDescriptions), '1' if len(listings) > 0 else '0')
con.commit()
if createLog:
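A quick illustration of the date_reference computation introduced above: relativedelta(weekday=FR(-1)) snaps a date back to the most recent Friday, or leaves it unchanged if the date already is one. The sample date is arbitrary.

    from datetime import datetime
    from dateutil.relativedelta import relativedelta, FR

    date = datetime.strptime("10252023", "%m%d%Y")          # Wednesday, Oct 25 2023
    date_reference = date + relativedelta(weekday=FR(-1))   # previous Friday
    print(date_reference.strftime("%Y-%m-%d"))              # 2023-10-20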
From c976032cc40945cf8a660ab9551e6366a95f6927 Mon Sep 17 00:00:00 2001
From: westernmeadow
Date: Fri, 27 Oct 2023 15:35:31 -0700
Subject: [PATCH 09/11] small fixes
---
Forums/Initialization/prepare_parser.py | 4 ++--
MarketPlaces/Initialization/prepare_parser.py | 15 ++++++++++++---
2 files changed, 14 insertions(+), 5 deletions(-)
diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py
index 31982fd..b86b5c6 100644
--- a/Forums/Initialization/prepare_parser.py
+++ b/Forums/Initialization/prepare_parser.py
@@ -345,8 +345,8 @@ def new_parse(forum, url, createLog):
forumId = verifyForum(cur, forum)
if (forumId > 0):
- readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.htm'))
- readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.htm'))
+ readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.html'))
+ readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.html'))
create_status(cur, forumId, CURRENT_DATE, len(readListings), len(readDescriptions), '1' if len(listings) > 0 else '0')
con.commit()
diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py
index 7c35f5a..de6cc79 100644
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -15,9 +15,10 @@ from MarketPlaces.M00nkeyMarket.parser import *
from MarketPlaces.MikesGrandStore.parser import *
from MarketPlaces.PabloEscobarMarket.parser import *
from MarketPlaces.CityMarket.parser import *
+from MarketPlaces.DarkBazar.parser import *
+from MarketPlaces.Sonanza.parser import *
from MarketPlaces.Kingdom.parser import *
-
from MarketPlaces.Classifier.classify_product import predict
nError = 0
@@ -132,6 +133,10 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
rw = pabloescobarmarket_listing_parser(soup)
elif marketPlace == "CityMarket":
rw = city_listing_parser(soup)
+ elif marketPlace == "DarkBazar":
+ rw = darkbazar_listing_parser(soup)
+ elif marketPlace == "Sonanza":
+ rw = sonanza_listing_parser(soup)
elif marketPlace == "Kingdom":
rw = kingdom_listing_parser(soup)
else:
@@ -168,6 +173,10 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
rmm = pabloescobarmarket_description_parser(soup)
elif marketPlace == "CityMarket":
rmm = city_description_parser(soup)
+ elif marketPlace == "DarkBazar":
+ rmm = darkbazar_description_parser(soup)
+ elif marketPlace == "Sonanza":
+ rmm = sonanza_description_parser(soup)
elif marketPlace == "Kingdom":
rmm = kingdom_description_parser(soup)
else:
@@ -373,8 +382,8 @@ def new_parse(marketPlace, url, createLog):
marketId = verifyMarketPlace(cur, marketPlace)
if (marketId > 0):
- readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.htm'))
- readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.htm'))
+ readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.html'))
+ readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.html'))
create_status(cur, marketId, CURRENT_DATE, len(readListings), len(readDescriptions), '1' if len(listings) > 0 else '0')
con.commit()
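A short sketch of the page-count logic above: the listings and descriptions figures stored by create_status are simply glob counts of the .html files already moved into each day's read folders. The shared-folder root, market name, and date below are placeholder assumptions.

    import glob
    import os

    # assumed layout: <shared_folder>\MarketPlaces\<market>\HTML_Pages\<MMDDYYYY>\...
    mainDir = os.path.join(r'\\VBoxSvr\Shared', r'MarketPlaces\Kingdom\HTML_Pages')
    CURRENT_DATE = '10272023'

    readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.html'))
    readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.html'))

    print(len(readListings), len(readDescriptions))  # counts handed to create_status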
From 746ec6ddd9a92b93fffc27d5a02ba2e130a3d559 Mon Sep 17 00:00:00 2001
From: chris
Date: Mon, 30 Oct 2023 00:31:19 -0700
Subject: [PATCH 10/11] Added crawler and parser for Black Pyramid Marketplace
---
MarketPlaces/BlackPyramid/crawler_selenium.py | 302 +++++++++-------
MarketPlaces/BlackPyramid/parser.py | 341 +++++++++++-------
2 files changed, 391 insertions(+), 252 deletions(-)
diff --git a/MarketPlaces/BlackPyramid/crawler_selenium.py b/MarketPlaces/BlackPyramid/crawler_selenium.py
index b257c40..cf93b4a 100644
--- a/MarketPlaces/BlackPyramid/crawler_selenium.py
+++ b/MarketPlaces/BlackPyramid/crawler_selenium.py
@@ -1,9 +1,7 @@
-__author__ = 'Helium'
+__author__ = 'cern'
'''
-BlackPyramid Forum Crawler (Selenium)
-cannot use bc no links are used
-kept in case issues are solved
+BlackPyramid Market Crawler (Selenium)
'''
from selenium import webdriver
@@ -11,64 +9,101 @@ from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
-
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver import ActionChains
+import selenium.webdriver.support.ui as uiClasses
from PIL import Image
+
import urllib.parse as urlparse
import os, re, time
-from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
-from MarketPlaces.BlackPyramid.parser import blackpyramid_links_parser
+from MarketPlaces.BlackPyramid.parser import BlackPyramid_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
+import traceback
+
+config = configparser.ConfigParser()
+config.read('../../setup.ini')
counter = 1
-baseURL = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/login/'
+baseURL = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/login/?login=1'
-# Opens Tor Browser, crawls the website, then parses, then closes tor
-#acts like the main method for the crawler, another function at the end of this code calls this function later
+# Opens Tor Browser, crawls the website
def startCrawling():
- mktName = getMKTName()
+ # Opening tor beforehand gives "Tor exited during startup error"
+ # opentor()
+
+ marketName = getMarketName()
+
driver = getAccess()
+ # Wait for website to load
+ input("Press ENTER when website has loaded")
+
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
- closeDriver(driver)
+ closetor(driver)
+
+ new_parse(marketName, baseURL, False)
- new_parse(mktName, baseURL, True)
+
+# Opens Tor Browser
+def opentor():
+ global pid
+ print("Connecting Tor...")
+ pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
+ pid = pro.pid
+ time.sleep(7.5)
+ input('Tor Connected. Press ENTER to continue\n')
+ return
+
+
+# Login
+def login(driver):
+ # entering username and password into input boxes
+ usernameBox = driver.find_element(by=By.XPATH, value="//input[@name='username_login']")
+ # Username here
+ usernameBox.send_keys('ChipotleSteakBurrito')
+ passwordBox = driver.find_element(by=By.XPATH, value="//input[@name='password_login']")
+ # Password here
+ passwordBox.send_keys('BlackBeans')
+
+ input("Press ENTER when CAPTCHA is completed\n")
+
+ # wait for listing page show up (This Xpath may need to change based on different seed url)
+ #WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+ # (By.XPATH, '/html/body/div[2]/div[3]/div[3]/div[1]/div[3]/nav/ul/li[10]/a')))
# Returns the name of the website
-#return: name of site in string type
-def getMKTName():
+def getMarketName():
name = 'BlackPyramid'
return name
-# Return the base link of the website
-#return: url of base site in string type
+# Return the link of the website
def getFixedURL():
- url = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/'
+ url = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/login/?login=1'
+
return url
# Closes Tor Browser
-#@param: current selenium driver
-def closeDriver(driver):
+def closetor(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
- driver.close()
+ driver.quit()
time.sleep(3)
return
@@ -76,8 +111,6 @@ def closeDriver(driver):
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
- from MarketPlaces.Initialization.markets_mining import config
-
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
@@ -106,16 +139,13 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
- driver.maximize_window()
-
return driver
-#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
-#return: return the selenium driver or string 'down'
def getAccess():
url = getFixedURL()
driver = createFFDriver()
+ input('Tor Connected. Press ENTER to continue\n')
try:
driver.get(url)
return driver
@@ -124,33 +154,9 @@ def getAccess():
return 'down'
-# Manual captcha solver, waits fora specific element so that the whole page loads, finds the input box, gets screenshot of captcha
-# then allows for manual solving of captcha in the terminal
-#@param: current selenium web driver
-def login(driver):
- # wait for login page
- login_link = driver.find_element(by=By.XPATH, value='/html/body/div/div/div[3]/div/main/div/div/div/div[2]/div/div/div/section[1]/input[1]')
- login_link.click() # open tab with url
-
- # entering username and password into input boxes
- usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
- # Username here
- usernameBox.send_keys('ChipotleSteakBurrito')
- passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
- # Password here
- passwordBox.send_keys('BlackBeans')
-
- input("Press ENTER when CAPTCHA is completed\n")
-
- # wait for listing page show up (This Xpath may need to change based on different seed url)
- WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.XPATH, '/html/body/div[2]/form/nav/nav/ul/li[2]/div/a/span[1]')))
-
-
-
-# Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(driver, page, url):
- cleanPage = cleanHTML(driver, page)
+# Saves the crawled html page
+def savePage(page, url):
+ cleanPage = cleanHTML(page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -158,100 +164,148 @@ def savePage(driver, page, url):
# Gets the full path of the page to be saved along with its appropriate file name
-#@param: raw url as crawler crawls through every site
def getFullPathName(url):
- from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
-
- mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
+ global counter
+ from MarketPlaces.Initialization.markets_mining import CURRENT_DATE
fileName = getNameFromURL(url)
if isDescriptionLink(url):
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
+ if (os.path.exists(r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html')):
+ fullPath = r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + "(" + str(counter) + ")" + '.html'
+ else:
+ fullPath = r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html'
else:
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
+ if (os.path.exists(r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html')):
+ fullPath = r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + "(" + str(counter) + ")" + '.html'
+ else:
+ fullPath = r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html'
return fullPath
-# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned
-#@param: raw url as crawler crawls through every site
+# Creates the file name from passed URL
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
- if (name == ''):
+ if name == '':
name = str(counter)
counter = counter + 1
return name
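
For readers skimming the diff: the two helpers above build duplicate-safe file names by hand with backslash literals. A minimal sketch of the same idea using os.path.join is shown below; the helper name, the base_dir argument, and the counter handling are illustrative only and not part of this patch.

import os

def unique_html_path(base_dir, current_date, kind, name, counter):
    # kind is either "Description" or "Listing"; if a file with the plain
    # name already exists, append "(counter)" the way getFullPathName does.
    plain = os.path.join(base_dir, current_date, kind, name + '.html')
    if not os.path.exists(plain):
        return plain
    return os.path.join(base_dir, current_date, kind, "%s(%d).html" % (name, counter))
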
+def goToPage(driver, page):
+ # hover over digital -> hacking tools
+ a = ActionChains(driver)
+
+ # hover
+ digitalB = driver.find_element(By.XPATH, "//li[@class='dig940']/div/a")
+ time.sleep(1)
+ a.move_to_element(digitalB).perform()
+ print(digitalB)
+
+ # delay for website to register hover
+ time.sleep(10)
+
+ # click
+ #xpath = "//input[@value='" + page + "']"
+ xpath = "//input[@name='" + page + "']"
+ link = driver.find_element(By.XPATH, xpath)
+ time.sleep(1)
+ a.move_to_element(link).click().perform()
+ print(link)
+
+ # wait for website to load
+ WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+ (By.XPATH, '/html/body/center/div[4]/div[1]/div[3]/article/div[1]/h1/a')))
+
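
goToPage above drives the market's hover menu. Below is a self-contained sketch of the hover-then-click pattern it relies on; the helper name and arguments are illustrative, and the long sleep mirrors the delay used in this patch.

import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def hover_then_click(driver, menu_xpath, target_xpath, wait_xpath, hover_delay=10):
    actions = ActionChains(driver)
    # hover over the menu entry so the submenu is rendered
    menu = driver.find_element(By.XPATH, menu_xpath)
    actions.move_to_element(menu).perform()
    time.sleep(hover_delay)
    # click the submenu entry
    target = driver.find_element(By.XPATH, target_xpath)
    actions.move_to_element(target).click().perform()
    # block until the destination page is visible
    WebDriverWait(driver, 100).until(
        EC.visibility_of_element_located((By.XPATH, wait_xpath)))
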
-# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list
-#in this example, there are a couple of categories some threads fall under such as
-# Guides and Tutorials, Digital Products, and Software and Malware
-#as you can see they are categories of products
def getInterestedLinks():
- links = []
-
- # Hacking Guides
- links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/')
- # # Exploits
- # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/')
- # # botnets/malware
- # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/')
- # # fraud software
- # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/')
- # # Other Tools
- # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/')
- # # Services
- # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/')
+ # h11 -> Hacking Tools
+ # g3 -> Guides, Hacking
+ # se3 -> Services, Hacking
+ # f6 -> Fraud software
+ links = ['h11','g3','se3','f6']
return links
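
The four codes returned above are the values of the site's category inputs; the human-readable names come from the comments in this hunk. A small lookup table like the following (not part of the patch) can make the crawl log easier to follow:

CATEGORY_NAMES = {
    'h11': 'Hacking Tools',
    'g3': 'Guides, Hacking',
    'se3': 'Services, Hacking',
    'f6': 'Fraud software',
}

for code, label in CATEGORY_NAMES.items():
    print('Crawling :', code, '->', label)
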
-# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through
-#topic and description pages are crawled through here, where both types of pages are saved
-#@param: selenium driver
def crawlForum(driver):
print("Crawling the BlackPyramid market")
- linksToCrawl = getInterestedLinks()
+ #linksToCrawl = getInterestedLinks()
+ #pages = ["Hacking Tools"]
+ pages = getInterestedLinks()
+ #visited = set(linksToCrawl)
+ initialTime = time.time()
i = 0
- while i < len(linksToCrawl):
- link = linksToCrawl[i]
- print('Crawling :', link)
+ count = 0
+
+ for listing in pages:
+ #link = linksToCrawl[i]
+ print('Crawling :', listing)
+
try:
- has_next_page = True
- count = 0
+ try:
+ goToPage(driver, listing)
+ except:
+                print("goToPage failed, refreshing the page")
+ driver.refresh()
+ time.sleep(5)
+ html = driver.page_source
+ savePage(html, listing)
+ has_next_page = True
+ currentPage = 1
+ numberOfPages = 1
while has_next_page:
- try:
- clicker = driver.find_element(by=By.XPATH, value='/html/body/div[2]/form/nav/nav/ul/li[2]/div/a')
- clicker.click() # open tab with url
- driver.get(link)
- except:
- driver.refresh()
- html = driver.page_source
- savePage(driver, html, link)
-
+ # get a list of urls for each listing
list = productPages(html)
for item in list:
+
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
+                    print("failed to load the item page, refreshing")
driver.refresh()
- savePage(driver, driver.page_source, item)
- driver.back()
+ savePage(driver.page_source, item)
+                    # can't use the back button on BlackPyramid
+ # driver.back()
# comment out
- break
+ # break
# comment out
- if count == 1:
- break
+ # if count == 1:
+ # count = 0
+ # break
+ # go to next page of market
try:
- clicker = driver.find_element(by=By.XPATH, value=
- '/html/body/center/div[4]/div/div[3]/div[23]/div[2]/input[1]')
- if clicker == "":
+ goToPage(driver, listing)
+ nav = driver.find_element(by=By.XPATH, value="//input[@name='next_page']")
+
+ if not nav.is_enabled():
+ raise NoSuchElementException
+ try:
+ # block obscuring element
+ #element = driver.find_element(by=By.XPATH, value="//input[@class='tei39950693']")
+ #driver.execute_script("arguments[0].style.visibility='hidden'", element)
+ # select next page
+ pgnum = uiClasses.Select(driver.find_element(by=By.XPATH, value="//select[@name='pageination']"))
+ print("pg options:", pgnum.options)
+ pgnum.select_by_index(currentPage)
+ numberOfPages = len(pgnum.options)
+
+ # click button
+ pgbutton = driver.find_element(by=By.XPATH, value="//input[@value='go to page']")
+ pgbutton.click()
+ except Exception as e:
+ print(e)
+ raise NoSuchElementException
+ time.sleep(10)
+ html = driver.page_source
+ savePage(html, listing)
+ currentPage += 1
+ if currentPage > numberOfPages:
raise NoSuchElementException
count += 1
@@ -259,39 +313,39 @@ def crawlForum(driver):
has_next_page = False
except Exception as e:
- print(link, e)
+ traceback.print_exc()
+ print(listing, e)
i += 1
- print("Crawling the BlackPyramid market done.")
+ # finalTime = time.time()
+ # print finalTime - initialTime
+
+    input("Crawling the BlackPyramid market done successfully. Press ENTER to continue\n")
-# Returns 'True' if the link is a description link
-#@param: url of any url crawled
-#return: true if is a description page, false if not
+# Returns 'True' if the link is a product description link
def isDescriptionLink(url):
- if 'products' in url:
+ if 'product' in url:
return True
return False
# Returns True if the link is a listingPage link
-#@param: url of any url crawled
-#return: true if is a Listing page, false if not
def isListingLink(url):
- if 'search' in url:
+ if 'category=' in url:
return True
return False
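
A quick usage sketch for the two predicates above; the URLs are made-up placeholders, not real market links:

print(isDescriptionLink('http://example.onion/product?id=42'))         # True: contains 'product'
print(isListingLink('http://example.onion/results?category=h11'))      # True: contains 'category='
print(isDescriptionLink('http://example.onion/results?category=h11'))  # False
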
-# calling the parser to define the links, the html is the url of a link from the list of interested link list
-#@param: link from interested link list ie. getInterestingLinks()
-#return: list of description links that should be crawled through
+# calling the parser to define the links
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
- return blackpyramid_links_parser(soup)
-
+ return BlackPyramid_links_parser(soup)
def crawler():
startCrawling()
- # print("Crawling and Parsing BlackPyramid .... DONE!")
+    # print("Crawling and Parsing BlackPyramid .... DONE!")
+
+if __name__ == '__main__':
+ startCrawling()
\ No newline at end of file
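
For orientation, the paging logic added to crawlForum boils down to the sketch below: check the 'next_page' input, pick the next entry of the 'pageination' select, and press the 'go to page' button. The element names are taken from this diff; the function itself is an illustration, not part of the patch.

from selenium.webdriver.common.by import By
from selenium.webdriver.support import ui as uiClasses
from selenium.common.exceptions import NoSuchElementException

def go_to_next_page(driver, current_page):
    # the 'next_page' input is disabled on the last page
    nav = driver.find_element(By.XPATH, "//input[@name='next_page']")
    if not nav.is_enabled():
        raise NoSuchElementException("last page reached")
    # the page selector is a <select> named 'pageination' (sic, as on the site)
    pgnum = uiClasses.Select(driver.find_element(By.XPATH, "//select[@name='pageination']"))
    number_of_pages = len(pgnum.options)
    pgnum.select_by_index(current_page)
    # submit the selection
    driver.find_element(By.XPATH, "//input[@value='go to page']").click()
    return number_of_pages
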
diff --git a/MarketPlaces/BlackPyramid/parser.py b/MarketPlaces/BlackPyramid/parser.py
index 743466a..5224c1e 100644
--- a/MarketPlaces/BlackPyramid/parser.py
+++ b/MarketPlaces/BlackPyramid/parser.py
@@ -1,4 +1,4 @@
-__author__ = 'Helium'
+__author__ = 'cern'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
@@ -11,7 +11,7 @@ from bs4 import BeautifulSoup
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of description page
#return: 'row' that contains a variety of lists that each hold info on the description page
-def darkfox_description_parser(soup):
+def BlackPyramid_description_parser(soup):
# Fields to be parsed
@@ -40,82 +40,71 @@ def darkfox_description_parser(soup):
EURO = "-1" # 22 Product_EURO_SellingPrice
# Finding Product Name
- name = soup.find('h1').text
+ name = soup.find('div', {'class': 'panel39002'}).find('span').next_sibling
name = name.replace('\n', ' ')
name = name.replace(",", "")
name = name.strip()
+ # product description
+ describe = soup.findAll('div', {'class': 'fer048953'})[1].text
+ describe = describe.replace('\n', ' ')
+ describe = describe.replace(",", "")
+ describe = describe.strip()
+
# Finding Vendor
- vendor = soup.find('h3').find('a').text.strip()
+ vendor = soup.find('div', {'class': 'bold03905 vstat364'}).text
+ vendor = vendor.split(" ")
+ vendor = vendor[2][:-1]
+ vendor = vendor.replace('\n', ' ')
+ vendor = vendor.replace(",", "")
+ vendor = vendor.strip()
# Finding Vendor Rating
- rating = soup.find('span', {'class': "tag is-dark"}).text.strip()
+ rating_span = soup.find('span', {'class': 'to3098503t'}).find_next_sibling('span')
+ rating_num = rating_span.find('b').text
+ if rating_num != 'N/A':
+ rating = rating_num[0:3]
# Finding Successful Transactions
- success = soup.find('h3').text
- success = success.replace("Vendor: ", "")
- success = success.replace(vendor, "")
- success = success.replace("(", "")
- success = success.replace(")", "")
+ success_container = soup.find('ul', {'class': 'ul3o00953'}).findAll('li')[1]
+ success = success_container.find('div').text
+ success = success.replace('"', '')
+ success = success.replace("\n", " ")
+ success = success.replace(",", "")
success = success.strip()
- bae = soup.find('div', {'class': "box"}).find_all('ul')
-
# Finding Prices
- USD = bae[1].find('strong').text.strip()
-
- li = bae[2].find_all('li')
+ USD_text = soup.find('li', {'class': 'vul2994 vghul995'}).find('div').text
+ USD = USD_text.split(',')[1]
+ USD = USD.replace('\n', ' ')
+ USD = USD.replace(",", "")
+ USD = USD.strip()
- # Finding Escrow
- escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip()
-
- # Finding the Product Category
- category = li[1].find('span', {'class': "tag is-dark"}).text.strip()
-
- # Finding the Product Quantity Available
- left = li[3].find('span', {'class': "tag is-dark"}).text.strip()
+ container = soup.find('ul', {'class': 'bic03095'})
# Finding Number Sold
- sold = li[4].find('span', {'class': "tag is-dark"}).text.strip()
-
- li = bae[3].find_all('li')
-
- # Finding Shipment Information (Origin)
- if "Ships from:" in li[-2].text:
- shipFrom = li[-2].text
- shipFrom = shipFrom.replace("Ships from: ", "")
- # shipFrom = shipFrom.replace(",", "")
- shipFrom = shipFrom.strip()
-
- # Finding Shipment Information (Destination)
- shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text
- shipTo = shipTo.replace("Ships to: ", "")
- shipTo = shipTo.strip()
- if "certain countries" in shipTo:
- countries = ""
- tags = li[-1].find_all('span', {'class': "tag"})
- for tag in tags:
- country = tag.text.strip()
- countries += country + ", "
- shipTo = countries.strip(", ")
-
- # Finding the Product description
- describe = soup.find('div', {'class': "pre-line"}).text
- describe = describe.replace("\n", " ")
- describe = describe.strip()
+ sold_container = container.find('li')
+ sold_div = sold_container.findAll('div')[2]
+ sold = sold_div.find('b').next_sibling
+ sold = sold.replace('"', '')
+ sold = sold.replace("\n", " ")
+ sold = sold.replace(",", "")
+ sold = sold.strip()
- '''# Finding the Number of Product Reviews
- tag = soup.findAll(text=re.compile('Reviews'))
- for index in tag:
- reviews = index
- par = reviews.find('(')
- if par >=0:
- reviews = reviews.replace("Reviews (","")
- reviews = reviews.replace(")","")
- reviews = reviews.split(",")
- review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
- else :
- review = "-1"'''
+ # Finding the Product Quantity Available
+ left_container = container.find('li')
+ left_div = left_container.findAll('div')[3]
+ left = left_div.find('b').next_sibling
+ left = left.replace('"', '')
+ left = left.replace("\n", " ")
+ left = left.replace(",", "")
+ left = left.strip()
+
+ # Finding number of reviews
+ positive = soup.find('span', {'class': 'ar04999324'}).text
+ neutral = soup.find('span', {'class': 'ti9400005 can39953'}).text
+ negative = soup.find('span', {'class': 'ti9400005 ti90088 can39953'}).text
+ review = int(positive) + int(neutral) + int(negative)
# Searching for CVE and MS categories
cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
@@ -147,11 +136,11 @@ def darkfox_description_parser(soup):
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of listing page
#return: 'row' that contains a variety of lists that each hold info on the listing page
-def darkfox_listing_parser(soup):
+def BlackPyramid_listing_parser(soup):
# Fields to be parsed
nm = 0 # Total_Products (Should be Integer)
- mktName = "DarkFox" # 0 Marketplace_Name
+ mktName = "BlackPyramid" # 0 Marketplace_Name
name = [] # 1 Product_Name
CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 3 Product_MS_Classification (Microsoft Security)
@@ -169,80 +158,82 @@ def darkfox_listing_parser(soup):
qLeft =[] # 15 Product_QuantityLeft
shipFrom = [] # 16 Product_ShippedFrom
shipTo = [] # 17 Product_ShippedTo
- vendor = [] # 18 Vendor
- rating = [] # 19 Vendor_Rating
- success = [] # 20 Vendor_Successful_Transactions
+ rating_item = [] # 18 Product_Rating
+ vendor = [] # 19 Vendor
+ rating = [] # 20 Vendor_Rating
+ success = [] # 21 Vendor_Successful_Transactions
href = [] # 23 Product_Links (Urls)
- listing = soup.findAll('div', {"class": "card"})
+ listing = soup.findAll('article', {"class": "product"})
+
+ # Some listing pages have an additional article section which is blank
+ if not listing[-1].findAll('a', href=True):
+ listing = listing[:-1]
+
# Populating the Number of Products
nm = len(listing)
- for a in listing:
- bae = a.findAll('a', href=True)
+ for card in listing:
+ bae = card.findAll('a', href=True)
# Adding the url to the list of urls
- link = bae[0].get('href')
+ link = bae[2].get('href')
link = cleanLink(link)
href.append(link)
# Finding the Product
- product = bae[1].find('p').text
+ product = bae[3].text
product = product.replace('\n', ' ')
product = product.replace(",", "")
product = product.replace("...", "")
product = product.strip()
name.append(product)
- bae = a.find('div', {'class': "media-content"}).find('div').find_all('div')
-
- if len(bae) >= 5:
- # Finding Prices
- price = bae[0].text
- ud = price.replace(" USD", " ")
- # u = ud.replace("$","")
- u = ud.replace(",", "")
- u = u.strip()
- USD.append(u)
- # bc = (prc[1]).strip(' BTC')
- # BTC.append(bc)
-
- # Finding the Vendor
- vendor_name = bae[1].find('a').text
- vendor_name = vendor_name.replace(",", "")
- vendor_name = vendor_name.strip()
- vendor.append(vendor_name)
-
- # Finding the Category
- cat = bae[2].find('small').text
- cat = cat.replace("Category: ", "")
- cat = cat.replace(",", "")
- cat = cat.strip()
- category.append(cat)
-
- # Finding Number Sold and Quantity Left
- num = bae[3].text
- num = num.replace("Sold: ", "")
- num = num.strip()
- sold.append(num)
-
- quant = bae[4].find('small').text
- quant = quant.replace("In stock: ", "")
- quant = quant.strip()
- qLeft.append(quant)
-
- # Finding Successful Transactions
- freq = bae[1].text
- freq = freq.replace(vendor_name, "")
- freq = re.sub(r'Vendor Level \d+', "", freq)
- freq = freq.replace("(", "")
- freq = freq.replace(")", "")
- freq = freq.strip()
- success.append(freq)
+ # Finding description
+        # 'recursive=False' only searches direct children
+ desc = card.findChildren('div', recursive=False)[0]
+ desc = desc.findAll('div', recursive=False)[3].text
+ desc = desc.replace('\n', ' ')
+ desc = desc.replace(",", "")
+ desc = desc.strip()
+ describe.append(desc)
+
+ # Finding Vendor Name
+ vendor_name = bae[4].find('span').text
+ vendor_name = vendor_name.split(' ')[1]
+ vendor_name = vendor_name.replace('\n', ' ')
+ vendor_name = vendor_name.replace(",", "")
+ vendor_name = vendor_name.strip()
+ vendor.append(vendor_name)
+
+ # Finding the Category
+ cat = card.findAll('div', recursive=False)[0].findAll('div', recursive=False)[1].find('span').text
+ cat = cat.replace("\n", "")
+ cat = cat.replace(",", "")
+ cat = cat.strip()
+ category.append(cat)
+
+ bae = card.findAll('div', recursive=False)[1].findAll('div', recursive=False)[1]
+
+ # Finding amount left
+ left = bae.findAll('div', recursive=False)[1].text
+ left = left.replace("x", "")
+ left = left.replace('\n', ' ')
+ left = left.replace(",", "")
+ left = left.strip()
+ qLeft.append(left)
+
+ # Finding amount sold
+ qsold = bae.findAll('div', recursive=False)[2].text
+ qsold = qsold.replace('\n', ' ')
+ qsold = qsold.replace("x", "")
+ qsold = qsold.replace(",", "")
+ qsold = qsold.strip()
+ sold.append(qsold)
# Searching for CVE and MS categories
- cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
+ cve = card.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if not cve:
cveValue="-1"
else:
@@ -255,7 +246,7 @@ def darkfox_listing_parser(soup):
cveValue=cee
CVE.append(cveValue)
- ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
+ ms = card.findAll(text=re.compile('MS\d{2}-\d{3}'))
if not ms:
MSValue="-1"
else:
@@ -269,23 +260,117 @@ def darkfox_listing_parser(soup):
MS.append(MSValue)
# Populate the final variable (this should be a list with all fields scraped)
- return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
- BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
+ return organizeProducts(mktName, nm, vendor, rating, success, name, CVE, MS, category, describe, views, reviews, rating,
+ addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
#called by the crawler to get description links on a listing page
#@param: beautifulsoup object that is using the correct html page (listing page)
#return: list of description links from a listing page
-def blackpyramid_links_parser(soup):
+def BlackPyramid_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.findAll('article', {"class": "product"})
- for div in listing:
-
- link = div.find('a', {"class": "ah39063"})['href']
- href.append(link)
-
- return href
\ No newline at end of file
+ for item in listing:
+
+ container = item.find('a', {"class": "ah39063"})
+
+ if container:
+ link = item.find('a', {"class": "ah39063"})['href']
+ href.append(link)
+
+ return href
+
+
+import glob
+import os
+import codecs
+import shutil
+import traceback
+
+if __name__ == '__main__':
+ nError = 0
+ marketPlace = 'BlackPyramid'
+
+ lines = [] # listing pages
+ lns = [] # description pages
+ detPage = {}
+
+ '''
+ # reading description pages
+ count = 0
+ for fileDescription in glob.glob(os.path.join("..\\" + marketPlace + "\\HTML_Pages\\10222023\\Description", '*.html')):
+ count += 1
+ lns.append(fileDescription)
+ # if count > 5:
+ # break
+
+ for index, line2 in enumerate(lns):
+
+ print("Reading description folder of '" + marketPlace + "', file '" + os.path.basename(line2) + "', index= " + str(index + 1) + " ... " + str(len(lns)))
+
+ try:
+ html = codecs.open(line2.strip('\n'), encoding='utf8')
+ soup = BeautifulSoup(html, "html.parser")
+ html.close()
+ except:
+
+ try:
+ html = open(line2.strip('\n'))
+ soup = BeautifulSoup(html, "html.parser")
+ html.close()
+ except:
+
+ nError += 1
+ print("There was a problem to read the file " + line2 + " in the Description section!")
+ # if createLog:
+ # logFile.write(str(nError) + ". There was a problem to read the file " + line2 + " in the Description section.\n")
+ continue
+
+ try:
+ print(BlackPyramid_description_parser(soup))
+ except:
+ traceback.print_exc()
+ print("There was a problem to parse the file " + line2 + " in the Description section!")
+ '''
+ # reading listing pages
+ count = 0
+ for fileListing in glob.glob(os.path.join("..\\" + marketPlace + "\\HTML_Pages\\10222023\\Listing", '*.html')):
+ count += 1
+ lines.append(fileListing)
+ # if count > 1:
+ # break
+
+ for index, line1 in enumerate(lines):
+
+ print("Reading listing folder of '" + marketPlace + "', file '" + os.path.basename(line1) + "', index= " + str(
+ index + 1) + " ... " + str(len(lines)))
+
+ readError = False
+ try:
+ html = codecs.open(line1.strip('\n'), encoding='utf8')
+ soup = BeautifulSoup(html, "html.parser")
+ html.close()
+ except:
+ try:
+ html = open(line1.strip('\n'))
+ soup = BeautifulSoup(html, "html.parser")
+ html.close()
+ except:
+ print("There was a problem to read the file " + line1 + " in the Listing section!")
+ readError = True
+
+ if not readError:
+
+ parseError = False
+ try:
+ print(BlackPyramid_listing_parser(soup))
+ except:
+ traceback.print_exc()
+ print("There was a problem to parse the file " + line1 + " in the listing section!")
+ parseError = True
+
+ print("DONE")
\ No newline at end of file
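
One detail worth spelling out from the listing parser above is the recursive=False note: BeautifulSoup's findAll/findChildren then only inspect direct children rather than the whole subtree. A tiny self-contained demonstration (the HTML string is invented for the example):

from bs4 import BeautifulSoup

html = "<article><div id='a'><div id='b'></div></div><div id='c'></div></article>"
card = BeautifulSoup(html, 'html.parser').article

print([d.get('id') for d in card.findAll('div', recursive=False)])  # ['a', 'c']
print([d.get('id') for d in card.findAll('div')])                   # ['a', 'b', 'c']
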
From 2e34fe2e7d2447b9694dc8b1e805ed703f494df6 Mon Sep 17 00:00:00 2001
From: chris
Date: Mon, 30 Oct 2023 00:33:48 -0700
Subject: [PATCH 11/11] Cleaned up some test comments in crawler and parser
---
MarketPlaces/BlackPyramid/crawler_selenium.py | 4 -
MarketPlaces/BlackPyramid/parser.py | 91 -------------------
2 files changed, 95 deletions(-)
diff --git a/MarketPlaces/BlackPyramid/crawler_selenium.py b/MarketPlaces/BlackPyramid/crawler_selenium.py
index cf93b4a..6f7e45a 100644
--- a/MarketPlaces/BlackPyramid/crawler_selenium.py
+++ b/MarketPlaces/BlackPyramid/crawler_selenium.py
@@ -204,7 +204,6 @@ def goToPage(driver, page):
time.sleep(10)
# click
- #xpath = "//input[@value='" + page + "']"
xpath = "//input[@name='" + page + "']"
link = driver.find_element(By.XPATH, xpath)
time.sleep(1)
@@ -286,9 +285,6 @@ def crawlForum(driver):
if not nav.is_enabled():
raise NoSuchElementException
try:
- # block obscuring element
- #element = driver.find_element(by=By.XPATH, value="//input[@class='tei39950693']")
- #driver.execute_script("arguments[0].style.visibility='hidden'", element)
# select next page
pgnum = uiClasses.Select(driver.find_element(by=By.XPATH, value="//select[@name='pageination']"))
print("pg options:", pgnum.options)
diff --git a/MarketPlaces/BlackPyramid/parser.py b/MarketPlaces/BlackPyramid/parser.py
index 5224c1e..4b45ee7 100644
--- a/MarketPlaces/BlackPyramid/parser.py
+++ b/MarketPlaces/BlackPyramid/parser.py
@@ -283,94 +283,3 @@ def BlackPyramid_links_parser(soup):
href.append(link)
return href
-
-
-import glob
-import os
-import codecs
-import shutil
-import traceback
-
-if __name__ == '__main__':
- nError = 0
- marketPlace = 'BlackPyramid'
-
- lines = [] # listing pages
- lns = [] # description pages
- detPage = {}
-
- '''
- # reading description pages
- count = 0
- for fileDescription in glob.glob(os.path.join("..\\" + marketPlace + "\\HTML_Pages\\10222023\\Description", '*.html')):
- count += 1
- lns.append(fileDescription)
- # if count > 5:
- # break
-
- for index, line2 in enumerate(lns):
-
- print("Reading description folder of '" + marketPlace + "', file '" + os.path.basename(line2) + "', index= " + str(index + 1) + " ... " + str(len(lns)))
-
- try:
- html = codecs.open(line2.strip('\n'), encoding='utf8')
- soup = BeautifulSoup(html, "html.parser")
- html.close()
- except:
-
- try:
- html = open(line2.strip('\n'))
- soup = BeautifulSoup(html, "html.parser")
- html.close()
- except:
-
- nError += 1
- print("There was a problem to read the file " + line2 + " in the Description section!")
- # if createLog:
- # logFile.write(str(nError) + ". There was a problem to read the file " + line2 + " in the Description section.\n")
- continue
-
- try:
- print(BlackPyramid_description_parser(soup))
- except:
- traceback.print_exc()
- print("There was a problem to parse the file " + line2 + " in the Description section!")
- '''
- # reading listing pages
- count = 0
- for fileListing in glob.glob(os.path.join("..\\" + marketPlace + "\\HTML_Pages\\10222023\\Listing", '*.html')):
- count += 1
- lines.append(fileListing)
- # if count > 1:
- # break
-
- for index, line1 in enumerate(lines):
-
- print("Reading listing folder of '" + marketPlace + "', file '" + os.path.basename(line1) + "', index= " + str(
- index + 1) + " ... " + str(len(lines)))
-
- readError = False
- try:
- html = codecs.open(line1.strip('\n'), encoding='utf8')
- soup = BeautifulSoup(html, "html.parser")
- html.close()
- except:
- try:
- html = open(line1.strip('\n'))
- soup = BeautifulSoup(html, "html.parser")
- html.close()
- except:
- print("There was a problem to read the file " + line1 + " in the Listing section!")
- readError = True
-
- if not readError:
-
- parseError = False
- try:
- print(BlackPyramid_listing_parser(soup))
- except:
- traceback.print_exc()
- print("There was a problem to parse the file " + line1 + " in the listing section!")
- parseError = True
-
- print("DONE")
\ No newline at end of file