diff --git a/.idea/DW_Pipeline_Test.iml b/.idea/DW_Pipeline_Test.iml
index cd99e29..5a5ac36 100644
--- a/.idea/DW_Pipeline_Test.iml
+++ b/.idea/DW_Pipeline_Test.iml
@@ -30,6 +30,7 @@
+
diff --git a/Forums/AbyssForum/crawler_selenium.py b/Forums/AbyssForum/crawler_selenium.py
index f33b521..071abb0 100644
--- a/Forums/AbyssForum/crawler_selenium.py
+++ b/Forums/AbyssForum/crawler_selenium.py
@@ -225,7 +225,7 @@ def crawlForum(driver):
has_next_topic_page = False
# end of loop
- for i in range(counter):
+ for j in range(counter):
driver.back()
# comment out
diff --git a/Forums/Altenens/crawler_selenium.py b/Forums/Altenens/crawler_selenium.py
index ec149ba..d13847a 100644
--- a/Forums/Altenens/crawler_selenium.py
+++ b/Forums/Altenens/crawler_selenium.py
@@ -173,28 +173,33 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
- # Hacking Tools
- links.append('https://altenens.is/forums/hacking-tools.469165/')
- # hash cracking
- links.append('https://altenens.is/forums/hash-cracking.469167/')
- # phishing and spamming
- links.append('https://altenens.is/forums/phishing-and-spamming.469223/')
- # pentesting
- links.append('https://altenens.is/forums/pentesting.469169/')
- # cracking tools
+ # Hacking
+ links.append('https://altenens.is/forums/hacking.469162/')
+ # Hacking showoff
+ links.append('https://altenens.is/forums/hacking-showoff.469232/')
+ # Remote administration
+ links.append('https://altenens.is/forums/remote-administration.469161/')
+ # Cracking tools
links.append('https://altenens.is/forums/cracking-tools.469204/')
- # Cracking Tools
+ # Cracking tutorials
links.append('https://altenens.is/forums/cracking-tutorials-other-methods.469205/')
+ # Combo lists and configs
+ links.append('https://altenens.is/forums/combolists-and-configs.469206/')
+ # Programming
+ links.append('https://altenens.is/forums/programming.469239/')
return links
+
# newest version of crawling
def crawlForum(driver):
print("Crawling the Altenens forum")
linksToCrawl = getInterestedLinks()
- for link in linksToCrawl:
+ i = 0
+ while i < len(linksToCrawl):
+ link = linksToCrawl[i]
print('Crawling :', link)
try:
has_next_page = True
@@ -235,7 +240,7 @@ def crawlForum(driver):
except NoSuchElementException:
has_next_topic_page = False
- for i in range(counter):
+ for j in range(counter):
driver.back()
# comment out
@@ -243,7 +248,7 @@ def crawlForum(driver):
# comment out
if count == 1:
- break
+ break
try:
link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
@@ -256,6 +261,7 @@ def crawlForum(driver):
except Exception as e:
print(link, e)
+ i += 1
print("Crawling the Altenens forum done.")
diff --git a/Forums/BestCardingWorld/crawler_selenium.py b/Forums/BestCardingWorld/crawler_selenium.py
index 487863b..6c3bdc9 100644
--- a/Forums/BestCardingWorld/crawler_selenium.py
+++ b/Forums/BestCardingWorld/crawler_selenium.py
@@ -235,7 +235,7 @@ def crawlForum(driver):
has_next_topic_page = False
# end of loop
- for i in range(counter):
+ for j in range(counter):
driver.back()
# comment out
diff --git a/Forums/Cardingleaks/crawler_selenium.py b/Forums/Cardingleaks/crawler_selenium.py
index caf4a9a..1e89751 100644
--- a/Forums/Cardingleaks/crawler_selenium.py
+++ b/Forums/Cardingleaks/crawler_selenium.py
@@ -181,18 +181,18 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
- # # carding methods
+ # carding methods
links.append('https://leaks.ws/forums/carding-methods.82/')
- # # carding schools
- # links.append('https://leaks.ws/forums/help-desk-carding-school.35/')
- # # carding discussion
- # links.append('https://leaks.ws/forums/carding-discussion-desk.58/')
- # # carding tutorials
- # links.append('https://leaks.ws/forums/carding-tutorials.13/')
- # # carding tools and software
- # links.append('https://leaks.ws/forums/carding-tools-softwares.10/')
- # # exploits and cracking tools
- # links.append('https://leaks.ws/forums/exploits-cracking-tools.22/')
+ # carding schools
+ links.append('https://leaks.ws/forums/help-desk-carding-school.35/')
+ # carding discussion
+ links.append('https://leaks.ws/forums/carding-discussion-desk.58/')
+ # carding tutorials
+ links.append('https://leaks.ws/forums/carding-tutorials.13/')
+ # carding tools and software
+ links.append('https://leaks.ws/forums/carding-tools-softwares.10/')
+ # exploits and cracking tools
+ links.append('https://leaks.ws/forums/exploits-cracking-tools.22/')
return links
@@ -245,11 +245,11 @@ def crawlForum(driver):
except NoSuchElementException:
has_next_topic_page = False
- for i in range(counter):
+ for j in range(counter):
driver.back()
# comment out
- # break
+ break
# comment out
if count == 1:
diff --git a/Forums/Cardingleaks/parser.py b/Forums/Cardingleaks/parser.py
index 022fbe1..f913243 100644
--- a/Forums/Cardingleaks/parser.py
+++ b/Forums/Cardingleaks/parser.py
@@ -109,7 +109,8 @@ def cardingleaks_listing_parser(soup: Tag):
li = soup.find("h1", {"class": "p-title-value"})
board = cleanString(li.text.strip())
- thread_list: ResultSet[Tag] = soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True})
+ thread_list = soup.find('div', {"class": "structItemContainer-group structItemContainer-group--sticky"}).find_all('div', {"data-author": True}) + \
+ soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True})
nm = len(thread_list)
@@ -120,10 +121,13 @@ def cardingleaks_listing_parser(soup: Tag):
thread_topic = thread.find("div", {"class": "structItem-title"}).text
topic.append(cleanString(thread_topic.strip()))
- author_icon = thread.find("a", {"class": "avatar avatar--s"}).find("img")
- author_icon = author_icon.get('src')
- author_icon = author_icon.split('base64,')[-1]
- image_user.append(author_icon)
+ author_icon = thread.find("a", {"class": "avatar avatar--s"})
+ if author_icon is not None:
+ author_icon = author_icon.find('img').get('src')
+ author_icon = author_icon.split('base64,')[-1]
+ image_user.append(author_icon)
+ else:
+ image_user.append('-1')
thread_view = thread.find("dl", {"class": "pairs pairs--justified structItem-minor"}).find("dd").text
# Context text view count (i.e., 8.8K) to numerical (i.e., 8800)
diff --git a/Forums/Classifier/test_classify.py b/Forums/Classifier/classify_test.py
similarity index 100%
rename from Forums/Classifier/test_classify.py
rename to Forums/Classifier/classify_test.py
diff --git a/Forums/CryptBB/crawler_selenium.py b/Forums/CryptBB/crawler_selenium.py
index bcef5a8..40255ce 100644
--- a/Forums/CryptBB/crawler_selenium.py
+++ b/Forums/CryptBB/crawler_selenium.py
@@ -199,28 +199,24 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
- # # Beginner Programming
+ # Beginner Programming
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86')
- # # Beginner Carding and Fraud
- # links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=91')
- # # Beginner Hacking
- # links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=87')
- # # Newbie
- # links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=84')
- # # Beginner Hardware
- # links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=89')
- # # Training Challenges
- # links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=96')
+ # Beginner Carding and Fraud
+ links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=91')
+ # Beginner Hacking
+ links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=87')
+ # Newbie
+ links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=84')
+ # Beginner Hardware
+ links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=89')
+ # Training Challenges
+ links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=96')
# Darknet Discussions
- # links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=88')
- # # Public Leaks and Warez
- # links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=97')
- # # Hacked Accounts and Database Dumps
- # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=30')
- # # Android Moded pak
- # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=53')
- # # Sell
- # links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=44')
+ links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=88')
+ # Public Leaks and Warez
+ links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=97')
+ # Sell
+ links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=44')
return links
@@ -260,9 +256,9 @@ def crawlForum(driver):
driver.refresh()
savePage(driver, driver.page_source, topic + f"page{counter}") # very important
- # comment out
- if counter == 2:
- break
+ # # comment out
+ # if counter == 2:
+ # break
try:
temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div')
@@ -275,15 +271,15 @@ def crawlForum(driver):
except NoSuchElementException:
has_next_topic_page = False
- for i in range(counter):
+ for j in range(counter):
driver.back()
- # comment out
- # break
-
- # comment out
- if count == 1:
- break
+ # # comment out
+ # break
+ #
+ # # comment out
+ # if count == 1:
+ # break
try:
temp = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div')
diff --git a/Forums/CryptBB/parser.py b/Forums/CryptBB/parser.py
index d725a98..60c513b 100644
--- a/Forums/CryptBB/parser.py
+++ b/Forums/CryptBB/parser.py
@@ -40,7 +40,6 @@ def cryptBB_description_parser(soup):
# Finding the repeated tag that corresponds to the listing of posts
- # try:
posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all(
'div', {"class": "post"})
@@ -48,6 +47,9 @@ def cryptBB_description_parser(soup):
for ipost in posts:
+ if ipost.find('div', {"class": "deleted_post_author"}):
+ continue
+
# Finding a first level of the HTML page
post_wrapper = ipost.find('span', {"class": "largetext"})
@@ -61,56 +63,49 @@ def cryptBB_description_parser(soup):
smalltext = ipost.find('div', {"class": "post_author"})
- '''
- # Testing here two possibilities to find this status and combine them
- if ipost.find('div', {"class": "deleted_post_author"}):
- status.append(-1)
- interest.append(-1)
- reputation.append(-1)
- addDate.append(-1)
- post.append("THIS POST HAS BEEN REMOVED!")
- sign.append(-1)
- feedback.append(-1)
- continue
- '''
-
- # CryptBB does have membergroup and postgroup
+ if smalltext is not None:
- membergroup = smalltext.find('div', {"class": "profile-rank"})
- postgroup = smalltext.find('div', {"class": "postgroup"})
- if membergroup != None:
- membergroup = membergroup.text.strip()
- if postgroup != None:
- postgroup = postgroup.text.strip()
- membergroup = membergroup + " - " + postgroup
- else:
- if postgroup != None:
- membergroup = postgroup.text.strip()
+ # CryptBB does have membergroup and postgroup
+ membergroup = smalltext.find('div', {"class": "profile-rank"})
+ postgroup = smalltext.find('div', {"class": "postgroup"})
+ if membergroup != None:
+ membergroup = membergroup.text.strip()
+ if postgroup != None:
+ postgroup = postgroup.text.strip()
+ membergroup = membergroup + " - " + postgroup
else:
- membergroup = "-1"
- status.append(cleanString(membergroup))
-
- # Finding the interest of the author
- # CryptBB does not have blurb
- blurb = smalltext.find('li', {"class": "blurb"})
- if blurb != None:
- blurb = blurb.text.strip()
- else:
- blurb = "-1"
- interest.append(cleanString(blurb))
-
- # Finding the reputation of the user
- # CryptBB does have reputation
- author_stats = smalltext.find('div', {"class": "author_statistics"})
- karma = author_stats.find('strong')
- if karma != None:
- karma = karma.text
- karma = karma.replace("Community Rating: ", "")
- karma = karma.replace("Karma: ", "")
- karma = karma.strip()
+ if postgroup != None:
+ membergroup = postgroup.text.strip()
+ else:
+ membergroup = "-1"
+ status.append(cleanString(membergroup))
+
+ # Finding the interest of the author
+ # CryptBB does not have blurb
+ blurb = smalltext.find('li', {"class": "blurb"})
+ if blurb != None:
+ blurb = blurb.text.strip()
+ else:
+ blurb = "-1"
+ interest.append(cleanString(blurb))
+
+ # Finding the reputation of the user
+ # CryptBB does have reputation
+ author_stats = smalltext.find('div', {"class": "author_statistics"})
+ karma = author_stats.find('strong')
+ if karma != None:
+ karma = karma.text
+ karma = karma.replace("Community Rating: ", "")
+ karma = karma.replace("Karma: ", "")
+ karma = karma.strip()
+ else:
+ karma = "-1"
+ reputation.append(cleanString(karma))
+
else:
- karma = "-1"
- reputation.append(cleanString(karma))
+ status.append('-1')
+ interest.append('-1')
+ reputation.append('-1')
# Getting here another good tag to find the post date, post content and users' signature
@@ -120,25 +115,30 @@ def cryptBB_description_parser(soup):
# dt = dt.strip().split()
dt = dt.strip()
day=date.today()
- if "Yesterday" in dt:
+ if "Today" in dt:
+ today = day.strftime('%m-%d-%Y')
+ stime = dt.replace('Today,','').strip()
+ date_time_obj = today + ', '+stime
+ date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
+ elif "Yesterday" in dt:
yesterday = day - timedelta(days=1)
yesterday = yesterday.strftime('%m-%d-%Y')
stime = dt.replace('Yesterday,','').strip()
- date_time_obj = yesterday+ ', '+stime
+ date_time_obj = yesterday + ', '+stime
date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
- elif "hour ago" in dt or "hours ago" in dt:
- day = day.strftime('%m-%d-%Y')
+ elif "ago" in dt:
date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title']
date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p')
else:
date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')
- stime = date_time_obj.strftime('%b %d, %Y')
- sdate = date_time_obj.strftime('%I:%M %p')
addDate.append(date_time_obj)
# Finding the post
inner = postarea.find('div', {"class": "post_body scaleimages"})
+ quote = inner.find('blockquote')
+ if quote is not None:
+ quote.decompose()
inner = inner.text.strip()
post.append(cleanString(inner))
@@ -210,6 +210,10 @@ def cryptBB_listing_parser(soup):
itopics = soup.find_all('tr',{"class": "inline_row"})
+ # Counting how many topics
+
+ nm = len(itopics)
+
for itopic in itopics:
# For each topic found, the structure to get the rest of the information can be of two types. Testing all of them
@@ -225,10 +229,6 @@ def cryptBB_listing_parser(soup):
image_user.append(-1)
- # Counting how many topics we have found so far
-
- nm = len(topic)
-
# Adding the url to the list of urls
try:
link = itopic.find('span', {"class": "subject_old"}).find('a').get('href')
@@ -237,19 +237,24 @@ def cryptBB_listing_parser(soup):
href.append(link)
# Finding the author of the topic
- ps = itopic.find('div', {"class":"author smalltext"}).find('a').text
+ ps = itopic.find('div', {"class":"author smalltext"}).text
user = ps.strip()
author.append(cleanString(user))
# Finding the number of replies
columns = itopic.findChildren('td',recursive=False)
replies = columns[3].text
-
- posts.append(cleanString(replies))
+ if replies == '-':
+ posts.append('-1')
+ else:
+ posts.append(cleanString(replies))
# Finding the number of Views
tview = columns[4].text
- views.append(cleanString(tview))
+ if tview == '-':
+ views.append('-1')
+ else:
+ views.append(cleanString(tview))
# If no information about when the topic was added, just assign "-1" to the variable
diff --git a/Forums/HiddenAnswers/crawler_selenium.py b/Forums/HiddenAnswers/crawler_selenium.py
index a7f37ea..f369347 100644
--- a/Forums/HiddenAnswers/crawler_selenium.py
+++ b/Forums/HiddenAnswers/crawler_selenium.py
@@ -157,16 +157,20 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
- # Hacks
- # links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/hacking')
-
- # links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/darknet-and-tor')
-
- # links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/internet')
-
+ # hacking
+ links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/hacking')
+ # darknet and tor
+ links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/darknet-and-tor')
+ # internet
+ links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/internet')
+ # links
links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/links')
-
-
+ # programming
+ links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/programming')
+ # knowledge and information
+ links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/knowledge-and-information')
+ # other
+ links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/other')
return links
@@ -206,12 +210,12 @@ def crawlForum(driver: webdriver.Firefox):
driver.refresh()
savePage(driver, driver.page_source, topic + f"page{counter}") # very important
- # comment out
- if counter == 2:
- break
+ # # comment out
+ # if counter == 2:
+ # break
try:
- page = "" # no next page so far may have some later on
+ page = driver.find_element(by=By.CLASS_NAME, value='qa-page-next').get_attribute('href')
if page == "":
raise NoSuchElementException
counter += 1
@@ -219,15 +223,15 @@ def crawlForum(driver: webdriver.Firefox):
except NoSuchElementException:
has_next_topic_page = False
- for i in range(counter):
+ for j in range(counter):
driver.back()
- # comment out
- # break
-
- # comment out
- if count == 1:
- break
+ # # comment out
+ # break
+ #
+ # # comment out
+ # if count == 1:
+ # break
try:
link = driver.find_element(by=By.CLASS_NAME, value='qa-page-next').get_attribute('href')
@@ -248,14 +252,14 @@ def crawlForum(driver: webdriver.Firefox):
# Returns 'True' if the link is Topic link
def isDescriptionLink(url):
- if 'index.php' in url and 'questions' not in url:
+ if 'http' not in url:
return True
return False
# Returns True if the link is a listingPage link
def isListingLink(url):
- if 'questions' in url:
+ if 'http' in url:
return True
return False
diff --git a/Forums/HiddenAnswers/parser.py b/Forums/HiddenAnswers/parser.py
index 995a7f0..0f2647f 100644
--- a/Forums/HiddenAnswers/parser.py
+++ b/Forums/HiddenAnswers/parser.py
@@ -42,14 +42,22 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup):
datetime_obj = datetime.strptime(datetime_string, "%Y-%m-%dT%H:%M:%S")
addDate.append(datetime_obj)
- question_user_status = question.find("span", {"class": "qa-q-view-who-title"}).text
- status.append(cleanString(question_user_status.strip()))
-
- question_user_karma = question.find("span", {"class": "qa-q-view-who-points-data"}).text
- # Convert karma to pure numerical string
- if question_user_karma.find("k") > -1:
- question_user_karma = str(float(question_user_karma.replace("k", "")) * 1000)
- reputation.append(cleanString(question_user_karma.strip()))
+ question_user_status = question.find("span", {"class": "qa-q-view-who-title"})
+ if question_user_status is not None:
+ question_user_status = question_user_status.text
+ status.append(cleanString(question_user_status.strip()))
+ else:
+ status.append('-1')
+
+ question_user_karma = question.find("span", {"class": "qa-q-view-who-points-data"})
+ if question_user_karma is not None:
+ question_user_karma = question_user_karma.text
+ # Convert karma to pure numerical string
+ if question_user_karma.find("k") > -1:
+ question_user_karma = str(float(question_user_karma.replace("k", "")) * 1000)
+ reputation.append(cleanString(question_user_karma.strip()))
+ else:
+ reputation.append('-1')
question_content = question.find("div", {"class": "qa-q-view-content qa-post-content"}).text
post.append(cleanString(question_content.strip()))
@@ -88,14 +96,22 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup):
post_data = replies.find("div", {"class": "qa-a-item-content qa-post-content"}).find("div",{"itemprop":"text"}).text
post.append(cleanString(post_data.strip()))
- user_reputations = replies.find("span", {"class", "qa-a-item-who-title"}).text
- status.append(cleanString(user_reputations.strip()))
+ user_reputations = replies.find("span", {"class", "qa-a-item-who-title"})
+ if user_reputations is not None:
+ user_reputations = user_reputations.text
+ status.append(cleanString(user_reputations.strip()))
+ else:
+ status.append('-1')
- karma = replies.find("span", {"class": "qa-a-item-who-points-data"}).text
- # Convert karma to pure numerical string
- if karma.find("k") > -1:
- karma = str(float(karma.replace("k", "")) * 1000)
- reputation.append(cleanString(karma.strip()))
+ karma = replies.find("span", {"class": "qa-a-item-who-points-data"})
+ if karma is not None:
+ karma = karma.text
+ # Convert karma to pure numerical string
+ if karma.find("k") > -1:
+ karma = str(float(karma.replace("k", "")) * 1000)
+ reputation.append(cleanString(karma.strip()))
+ else:
+ reputation.append('-1')
feedback.append("-1")
sign.append("-1")
@@ -139,8 +155,9 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup):
image_user = [] # 8 all user avatars used in each topic
# Finding the board
- literature = soup.find("div", {"class": "qa-main-heading"}).find("h1")
- board = literature.text
+ board = soup.find("div", {"class": "qa-main-heading"}).find("h1").text
+ board = board.replace('Recent questions in', '')
+ board = cleanString(board.strip())
queries_by_user: ResultSet[Tag] = soup.find("div", {"class": "qa-q-list"}).find_all("div", {"class": "qa-q-list-item"})
@@ -148,9 +165,9 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup):
topic_of_query = queries.find("div", {"class": "qa-q-item-title"}).find("a").text
topic.append(cleanString(topic_of_query.strip()))
- image_user.append("-1")
+ image_user.append("-1") # qa-q-item-where
- author = queries.find("span", {"class": "qa-q-item-who-data"}).find("a").text
+ author = queries.find("span", {"class": "qa-q-item-who-data"}).text
user.append(cleanString(author.strip()))
num_answers = queries.find("span", {"class": "qa-a-count-data"}).text
diff --git a/Forums/Initialization/forums_mining.py b/Forums/Initialization/forums_mining.py
index 311ac6c..4d68840 100644
--- a/Forums/Initialization/forums_mining.py
+++ b/Forums/Initialization/forums_mining.py
@@ -102,7 +102,7 @@ def opentor():
# main method
if __name__ == '__main__':
- # opentor()
+ opentor()
# assignment from forumsList.txt
forumsList = getForums()
diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py
index 267f887..ac1523f 100644
--- a/Forums/Initialization/prepare_parser.py
+++ b/Forums/Initialization/prepare_parser.py
@@ -212,7 +212,7 @@ def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descript
def move_file(filePath, createLog, logFile):
source = filePath
- destination = filePath.replace(os.path.basename(filePath), "") + r'Read/'
+ destination = filePath.replace(os.path.basename(filePath), "") + r'Read/' + os.path.basename(filePath)
try:
shutil.move(source, destination, shutil.copy2)
@@ -238,6 +238,9 @@ def new_parse(forum, url, createLog):
from Forums.Initialization.forums_mining import config, CURRENT_DATE
+ global nError
+ nError = 0
+
print("Parsing the " + forum + " forum and conduct data classification to store the information in the database.")
# Connecting to the database
diff --git a/Forums/Libre/crawler_selenium.py b/Forums/Libre/crawler_selenium.py
index 98b5517..58274ec 100644
--- a/Forums/Libre/crawler_selenium.py
+++ b/Forums/Libre/crawler_selenium.py
@@ -239,7 +239,7 @@ def crawlForum(driver):
except NoSuchElementException:
has_next_topic_page = False
- for i in range(counter):
+ for j in range(counter):
driver.back()
# comment out
diff --git a/Forums/OnniForums/crawler_selenium.py b/Forums/OnniForums/crawler_selenium.py
index 806f869..03f2367 100644
--- a/Forums/OnniForums/crawler_selenium.py
+++ b/Forums/OnniForums/crawler_selenium.py
@@ -250,7 +250,7 @@ def crawlForum(driver):
except NoSuchElementException:
has_next_topic_page = False
- for i in range(counter):
+ for j in range(counter):
driver.back()
# comment out
diff --git a/Forums/Procrax/crawler_selenium.py b/Forums/Procrax/crawler_selenium.py
index 7115d6c..7c15483 100644
--- a/Forums/Procrax/crawler_selenium.py
+++ b/Forums/Procrax/crawler_selenium.py
@@ -241,7 +241,7 @@ def crawlForum(driver):
except NoSuchElementException:
has_next_topic_page = False
- for i in range(counter):
+ for j in range(counter):
driver.back()
# comment out
diff --git a/MarketPlaces/Classifier/test_classify.py b/MarketPlaces/Classifier/classify_test.py
similarity index 100%
rename from MarketPlaces/Classifier/test_classify.py
rename to MarketPlaces/Classifier/classify_test.py
diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py
index c5af58b..b94723f 100644
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -295,6 +295,9 @@ def new_parse(marketPlace, url, createLog):
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
+ global nError
+ nError = 0
+
print("Parsing the " + marketPlace + " market and conduct data classification to store the information in the database.")
# Connecting to the database