Browse Source

Finished fully running CryptBB and HiddenAnswers

main
westernmeadow 1 year ago
parent
commit
2dc18e23f4
18 changed files with 210 additions and 171 deletions
  1. +1
    -0
      .idea/DW_Pipeline_Test.iml
  2. +1
    -1
      Forums/AbyssForum/crawler_selenium.py
  3. +19
    -13
      Forums/Altenens/crawler_selenium.py
  4. +1
    -1
      Forums/BestCardingWorld/crawler_selenium.py
  5. +13
    -13
      Forums/Cardingleaks/crawler_selenium.py
  6. +9
    -5
      Forums/Cardingleaks/parser.py
  7. +0
    -0
      Forums/Classifier/classify_test.py
  8. +26
    -30
      Forums/CryptBB/crawler_selenium.py
  9. +67
    -62
      Forums/CryptBB/parser.py
  10. +26
    -22
      Forums/HiddenAnswers/crawler_selenium.py
  11. +36
    -19
      Forums/HiddenAnswers/parser.py
  12. +1
    -1
      Forums/Initialization/forums_mining.py
  13. +4
    -1
      Forums/Initialization/prepare_parser.py
  14. +1
    -1
      Forums/Libre/crawler_selenium.py
  15. +1
    -1
      Forums/OnniForums/crawler_selenium.py
  16. +1
    -1
      Forums/Procrax/crawler_selenium.py
  17. +0
    -0
      MarketPlaces/Classifier/classify_test.py
  18. +3
    -0
      MarketPlaces/Initialization/prepare_parser.py

+ 1
- 0
.idea/DW_Pipeline_Test.iml View File

@ -30,6 +30,7 @@
<option value="$MODULE_DIR$/MarketPlaces/DarkBazar" /> <option value="$MODULE_DIR$/MarketPlaces/DarkBazar" />
<option value="$MODULE_DIR$/MarketPlaces/AnonMarket" /> <option value="$MODULE_DIR$/MarketPlaces/AnonMarket" />
<option value="$MODULE_DIR$/MarketPlaces/Tor2door" /> <option value="$MODULE_DIR$/MarketPlaces/Tor2door" />
<option value="$MODULE_DIR$/MarketPlaces/MetaVerseMarket" />
</list> </list>
</option> </option>
</component> </component>

+ 1
- 1
Forums/AbyssForum/crawler_selenium.py View File

@ -225,7 +225,7 @@ def crawlForum(driver):
has_next_topic_page = False has_next_topic_page = False
# end of loop # end of loop
for i in range(counter):
for j in range(counter):
driver.back() driver.back()
# comment out # comment out


+ 19
- 13
Forums/Altenens/crawler_selenium.py View File

@ -173,28 +173,33 @@ def getNameFromURL(url):
def getInterestedLinks(): def getInterestedLinks():
links = [] links = []
# Hacking Tools
links.append('https://altenens.is/forums/hacking-tools.469165/')
# hash cracking
links.append('https://altenens.is/forums/hash-cracking.469167/')
# phishing and spamming
links.append('https://altenens.is/forums/phishing-and-spamming.469223/')
# pentesting
links.append('https://altenens.is/forums/pentesting.469169/')
# cracking tools
# Hacking
links.append('https://altenens.is/forums/hacking.469162/')
# Hacking showoff
links.append('https://altenens.is/forums/hacking-showoff.469232/')
# Remote administration
links.append('https://altenens.is/forums/remote-administration.469161/')
# Cracking tools
links.append('https://altenens.is/forums/cracking-tools.469204/') links.append('https://altenens.is/forums/cracking-tools.469204/')
# Cracking Tools
# Cracking tutorials
links.append('https://altenens.is/forums/cracking-tutorials-other-methods.469205/') links.append('https://altenens.is/forums/cracking-tutorials-other-methods.469205/')
# Combo lists and configs
links.append('https://altenens.is/forums/combolists-and-configs.469206/')
# Programming
links.append('https://altenens.is/forums/programming.469239/')
return links return links
# newest version of crawling # newest version of crawling
def crawlForum(driver): def crawlForum(driver):
print("Crawling the Altenens forum") print("Crawling the Altenens forum")
linksToCrawl = getInterestedLinks() linksToCrawl = getInterestedLinks()
for link in linksToCrawl:
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link) print('Crawling :', link)
try: try:
has_next_page = True has_next_page = True
@ -235,7 +240,7 @@ def crawlForum(driver):
except NoSuchElementException: except NoSuchElementException:
has_next_topic_page = False has_next_topic_page = False
for i in range(counter):
for j in range(counter):
driver.back() driver.back()
# comment out # comment out
@ -243,7 +248,7 @@ def crawlForum(driver):
# comment out # comment out
if count == 1: if count == 1:
break
break
try: try:
link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
@ -256,6 +261,7 @@ def crawlForum(driver):
except Exception as e: except Exception as e:
print(link, e) print(link, e)
i += 1
print("Crawling the Altenens forum done.") print("Crawling the Altenens forum done.")


+ 1
- 1
Forums/BestCardingWorld/crawler_selenium.py View File

@ -235,7 +235,7 @@ def crawlForum(driver):
has_next_topic_page = False has_next_topic_page = False
# end of loop # end of loop
for i in range(counter):
for j in range(counter):
driver.back() driver.back()
# comment out # comment out


+ 13
- 13
Forums/Cardingleaks/crawler_selenium.py View File

@ -181,18 +181,18 @@ def getNameFromURL(url):
def getInterestedLinks(): def getInterestedLinks():
links = [] links = []
# # carding methods
# carding methods
links.append('https://leaks.ws/forums/carding-methods.82/') links.append('https://leaks.ws/forums/carding-methods.82/')
# # carding schools
# links.append('https://leaks.ws/forums/help-desk-carding-school.35/')
# # carding discussion
# links.append('https://leaks.ws/forums/carding-discussion-desk.58/')
# # carding tutorials
# links.append('https://leaks.ws/forums/carding-tutorials.13/')
# # carding tools and software
# links.append('https://leaks.ws/forums/carding-tools-softwares.10/')
# # exploits and cracking tools
# links.append('https://leaks.ws/forums/exploits-cracking-tools.22/')
# carding schools
links.append('https://leaks.ws/forums/help-desk-carding-school.35/')
# carding discussion
links.append('https://leaks.ws/forums/carding-discussion-desk.58/')
# carding tutorials
links.append('https://leaks.ws/forums/carding-tutorials.13/')
# carding tools and software
links.append('https://leaks.ws/forums/carding-tools-softwares.10/')
# exploits and cracking tools
links.append('https://leaks.ws/forums/exploits-cracking-tools.22/')
return links return links
@ -245,11 +245,11 @@ def crawlForum(driver):
except NoSuchElementException: except NoSuchElementException:
has_next_topic_page = False has_next_topic_page = False
for i in range(counter):
for j in range(counter):
driver.back() driver.back()
# comment out # comment out
# break
break
# comment out # comment out
if count == 1: if count == 1:


+ 9
- 5
Forums/Cardingleaks/parser.py View File

@ -109,7 +109,8 @@ def cardingleaks_listing_parser(soup: Tag):
li = soup.find("h1", {"class": "p-title-value"}) li = soup.find("h1", {"class": "p-title-value"})
board = cleanString(li.text.strip()) board = cleanString(li.text.strip())
thread_list: ResultSet[Tag] = soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True})
thread_list = soup.find('div', {"class": "structItemContainer-group structItemContainer-group--sticky"}).find_all('div', {"data-author": True}) + \
soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True})
nm = len(thread_list) nm = len(thread_list)
@ -120,10 +121,13 @@ def cardingleaks_listing_parser(soup: Tag):
thread_topic = thread.find("div", {"class": "structItem-title"}).text thread_topic = thread.find("div", {"class": "structItem-title"}).text
topic.append(cleanString(thread_topic.strip())) topic.append(cleanString(thread_topic.strip()))
author_icon = thread.find("a", {"class": "avatar avatar--s"}).find("img")
author_icon = author_icon.get('src')
author_icon = author_icon.split('base64,')[-1]
image_user.append(author_icon)
author_icon = thread.find("a", {"class": "avatar avatar--s"})
if author_icon is not None:
author_icon = author_icon.find('img').get('src')
author_icon = author_icon.split('base64,')[-1]
image_user.append(author_icon)
else:
image_user.append('-1')
thread_view = thread.find("dl", {"class": "pairs pairs--justified structItem-minor"}).find("dd").text thread_view = thread.find("dl", {"class": "pairs pairs--justified structItem-minor"}).find("dd").text
# Context text view count (i.e., 8.8K) to numerical (i.e., 8800) # Context text view count (i.e., 8.8K) to numerical (i.e., 8800)


Forums/Classifier/test_classify.py → Forums/Classifier/classify_test.py View File


+ 26
- 30
Forums/CryptBB/crawler_selenium.py View File

@ -199,28 +199,24 @@ def getNameFromURL(url):
def getInterestedLinks(): def getInterestedLinks():
links = [] links = []
# # Beginner Programming
# Beginner Programming
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86') links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86')
# # Beginner Carding and Fraud
# links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=91')
# # Beginner Hacking
# links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=87')
# # Newbie
# links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=84')
# # Beginner Hardware
# links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=89')
# # Training Challenges
# links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=96')
# Beginner Carding and Fraud
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=91')
# Beginner Hacking
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=87')
# Newbie
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=84')
# Beginner Hardware
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=89')
# Training Challenges
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=96')
# Darknet Discussions # Darknet Discussions
# links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=88')
# # Public Leaks and Warez
# links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=97')
# # Hacked Accounts and Database Dumps
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=30')
# # Android Moded pak
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=53')
# # Sell
# links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=44')
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=88')
# Public Leaks and Warez
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=97')
# Sell
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=44')
return links return links
@ -260,9 +256,9 @@ def crawlForum(driver):
driver.refresh() driver.refresh()
savePage(driver, driver.page_source, topic + f"page{counter}") # very important savePage(driver, driver.page_source, topic + f"page{counter}") # very important
# comment out
if counter == 2:
break
# # comment out
# if counter == 2:
# break
try: try:
temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div') temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div')
@ -275,15 +271,15 @@ def crawlForum(driver):
except NoSuchElementException: except NoSuchElementException:
has_next_topic_page = False has_next_topic_page = False
for i in range(counter):
for j in range(counter):
driver.back() driver.back()
# comment out
# break
# comment out
if count == 1:
break
# # comment out
# break
#
# # comment out
# if count == 1:
# break
try: try:
temp = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div') temp = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div')


+ 67
- 62
Forums/CryptBB/parser.py View File

@ -40,7 +40,6 @@ def cryptBB_description_parser(soup):
# Finding the repeated tag that corresponds to the listing of posts # Finding the repeated tag that corresponds to the listing of posts
# try:
posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all( posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all(
'div', {"class": "post"}) 'div', {"class": "post"})
@ -48,6 +47,9 @@ def cryptBB_description_parser(soup):
for ipost in posts: for ipost in posts:
if ipost.find('div', {"class": "deleted_post_author"}):
continue
# Finding a first level of the HTML page # Finding a first level of the HTML page
post_wrapper = ipost.find('span', {"class": "largetext"}) post_wrapper = ipost.find('span', {"class": "largetext"})
@ -61,56 +63,49 @@ def cryptBB_description_parser(soup):
smalltext = ipost.find('div', {"class": "post_author"}) smalltext = ipost.find('div', {"class": "post_author"})
'''
# Testing here two possibilities to find this status and combine them
if ipost.find('div', {"class": "deleted_post_author"}):
status.append(-1)
interest.append(-1)
reputation.append(-1)
addDate.append(-1)
post.append("THIS POST HAS BEEN REMOVED!")
sign.append(-1)
feedback.append(-1)
continue
'''
# CryptBB does have membergroup and postgroup
if smalltext is not None:
membergroup = smalltext.find('div', {"class": "profile-rank"})
postgroup = smalltext.find('div', {"class": "postgroup"})
if membergroup != None:
membergroup = membergroup.text.strip()
if postgroup != None:
postgroup = postgroup.text.strip()
membergroup = membergroup + " - " + postgroup
else:
if postgroup != None:
membergroup = postgroup.text.strip()
# CryptBB does have membergroup and postgroup
membergroup = smalltext.find('div', {"class": "profile-rank"})
postgroup = smalltext.find('div', {"class": "postgroup"})
if membergroup != None:
membergroup = membergroup.text.strip()
if postgroup != None:
postgroup = postgroup.text.strip()
membergroup = membergroup + " - " + postgroup
else: else:
membergroup = "-1"
status.append(cleanString(membergroup))
# Finding the interest of the author
# CryptBB does not have blurb
blurb = smalltext.find('li', {"class": "blurb"})
if blurb != None:
blurb = blurb.text.strip()
else:
blurb = "-1"
interest.append(cleanString(blurb))
# Finding the reputation of the user
# CryptBB does have reputation
author_stats = smalltext.find('div', {"class": "author_statistics"})
karma = author_stats.find('strong')
if karma != None:
karma = karma.text
karma = karma.replace("Community Rating: ", "")
karma = karma.replace("Karma: ", "")
karma = karma.strip()
if postgroup != None:
membergroup = postgroup.text.strip()
else:
membergroup = "-1"
status.append(cleanString(membergroup))
# Finding the interest of the author
# CryptBB does not have blurb
blurb = smalltext.find('li', {"class": "blurb"})
if blurb != None:
blurb = blurb.text.strip()
else:
blurb = "-1"
interest.append(cleanString(blurb))
# Finding the reputation of the user
# CryptBB does have reputation
author_stats = smalltext.find('div', {"class": "author_statistics"})
karma = author_stats.find('strong')
if karma != None:
karma = karma.text
karma = karma.replace("Community Rating: ", "")
karma = karma.replace("Karma: ", "")
karma = karma.strip()
else:
karma = "-1"
reputation.append(cleanString(karma))
else: else:
karma = "-1"
reputation.append(cleanString(karma))
status.append('-1')
interest.append('-1')
reputation.append('-1')
# Getting here another good tag to find the post date, post content and users' signature # Getting here another good tag to find the post date, post content and users' signature
@ -120,25 +115,30 @@ def cryptBB_description_parser(soup):
# dt = dt.strip().split() # dt = dt.strip().split()
dt = dt.strip() dt = dt.strip()
day=date.today() day=date.today()
if "Yesterday" in dt:
if "Today" in dt:
today = day.strftime('%m-%d-%Y')
stime = dt.replace('Today,','').strip()
date_time_obj = today + ', '+stime
date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
elif "Yesterday" in dt:
yesterday = day - timedelta(days=1) yesterday = day - timedelta(days=1)
yesterday = yesterday.strftime('%m-%d-%Y') yesterday = yesterday.strftime('%m-%d-%Y')
stime = dt.replace('Yesterday,','').strip() stime = dt.replace('Yesterday,','').strip()
date_time_obj = yesterday+ ', '+stime
date_time_obj = yesterday + ', '+stime
date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p') date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
elif "hour ago" in dt or "hours ago" in dt:
day = day.strftime('%m-%d-%Y')
elif "ago" in dt:
date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title'] date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title']
date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p') date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p')
else: else:
date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p') date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')
stime = date_time_obj.strftime('%b %d, %Y')
sdate = date_time_obj.strftime('%I:%M %p')
addDate.append(date_time_obj) addDate.append(date_time_obj)
# Finding the post # Finding the post
inner = postarea.find('div', {"class": "post_body scaleimages"}) inner = postarea.find('div', {"class": "post_body scaleimages"})
quote = inner.find('blockquote')
if quote is not None:
quote.decompose()
inner = inner.text.strip() inner = inner.text.strip()
post.append(cleanString(inner)) post.append(cleanString(inner))
@ -210,6 +210,10 @@ def cryptBB_listing_parser(soup):
itopics = soup.find_all('tr',{"class": "inline_row"}) itopics = soup.find_all('tr',{"class": "inline_row"})
# Counting how many topics
nm = len(itopics)
for itopic in itopics: for itopic in itopics:
# For each topic found, the structure to get the rest of the information can be of two types. Testing all of them # For each topic found, the structure to get the rest of the information can be of two types. Testing all of them
@ -225,10 +229,6 @@ def cryptBB_listing_parser(soup):
image_user.append(-1) image_user.append(-1)
# Counting how many topics we have found so far
nm = len(topic)
# Adding the url to the list of urls # Adding the url to the list of urls
try: try:
link = itopic.find('span', {"class": "subject_old"}).find('a').get('href') link = itopic.find('span', {"class": "subject_old"}).find('a').get('href')
@ -237,19 +237,24 @@ def cryptBB_listing_parser(soup):
href.append(link) href.append(link)
# Finding the author of the topic # Finding the author of the topic
ps = itopic.find('div', {"class":"author smalltext"}).find('a').text
ps = itopic.find('div', {"class":"author smalltext"}).text
user = ps.strip() user = ps.strip()
author.append(cleanString(user)) author.append(cleanString(user))
# Finding the number of replies # Finding the number of replies
columns = itopic.findChildren('td',recursive=False) columns = itopic.findChildren('td',recursive=False)
replies = columns[3].text replies = columns[3].text
posts.append(cleanString(replies))
if replies == '-':
posts.append('-1')
else:
posts.append(cleanString(replies))
# Finding the number of Views # Finding the number of Views
tview = columns[4].text tview = columns[4].text
views.append(cleanString(tview))
if tview == '-':
views.append('-1')
else:
views.append(cleanString(tview))
# If no information about when the topic was added, just assign "-1" to the variable # If no information about when the topic was added, just assign "-1" to the variable


+ 26
- 22
Forums/HiddenAnswers/crawler_selenium.py View File

@ -157,16 +157,20 @@ def getNameFromURL(url):
def getInterestedLinks(): def getInterestedLinks():
links = [] links = []
# Hacks
# links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/hacking')
# links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/darknet-and-tor')
# links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/internet')
# hacking
links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/hacking')
# darknet and tor
links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/darknet-and-tor')
# internet
links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/internet')
# links
links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/links') links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/links')
# programming
links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/programming')
# knowledge and information
links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/knowledge-and-information')
# other
links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/other')
return links return links
@ -206,12 +210,12 @@ def crawlForum(driver: webdriver.Firefox):
driver.refresh() driver.refresh()
savePage(driver, driver.page_source, topic + f"page{counter}") # very important savePage(driver, driver.page_source, topic + f"page{counter}") # very important
# comment out
if counter == 2:
break
# # comment out
# if counter == 2:
# break
try: try:
page = "" # no next page so far may have some later on
page = driver.find_element(by=By.CLASS_NAME, value='qa-page-next').get_attribute('href')
if page == "": if page == "":
raise NoSuchElementException raise NoSuchElementException
counter += 1 counter += 1
@ -219,15 +223,15 @@ def crawlForum(driver: webdriver.Firefox):
except NoSuchElementException: except NoSuchElementException:
has_next_topic_page = False has_next_topic_page = False
for i in range(counter):
for j in range(counter):
driver.back() driver.back()
# comment out
# break
# comment out
if count == 1:
break
# # comment out
# break
#
# # comment out
# if count == 1:
# break
try: try:
link = driver.find_element(by=By.CLASS_NAME, value='qa-page-next').get_attribute('href') link = driver.find_element(by=By.CLASS_NAME, value='qa-page-next').get_attribute('href')
@ -248,14 +252,14 @@ def crawlForum(driver: webdriver.Firefox):
# Returns 'True' if the link is Topic link # Returns 'True' if the link is Topic link
def isDescriptionLink(url): def isDescriptionLink(url):
if 'index.php' in url and 'questions' not in url:
if 'http' not in url:
return True return True
return False return False
# Returns True if the link is a listingPage link # Returns True if the link is a listingPage link
def isListingLink(url): def isListingLink(url):
if 'questions' in url:
if 'http' in url:
return True return True
return False return False


+ 36
- 19
Forums/HiddenAnswers/parser.py View File

@ -42,14 +42,22 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup):
datetime_obj = datetime.strptime(datetime_string, "%Y-%m-%dT%H:%M:%S") datetime_obj = datetime.strptime(datetime_string, "%Y-%m-%dT%H:%M:%S")
addDate.append(datetime_obj) addDate.append(datetime_obj)
question_user_status = question.find("span", {"class": "qa-q-view-who-title"}).text
status.append(cleanString(question_user_status.strip()))
question_user_karma = question.find("span", {"class": "qa-q-view-who-points-data"}).text
# Convert karma to pure numerical string
if question_user_karma.find("k") > -1:
question_user_karma = str(float(question_user_karma.replace("k", "")) * 1000)
reputation.append(cleanString(question_user_karma.strip()))
question_user_status = question.find("span", {"class": "qa-q-view-who-title"})
if question_user_status is not None:
question_user_status = question_user_status.text
status.append(cleanString(question_user_status.strip()))
else:
status.append('-1')
question_user_karma = question.find("span", {"class": "qa-q-view-who-points-data"})
if question_user_karma is not None:
question_user_karma = question_user_karma.text
# Convert karma to pure numerical string
if question_user_karma.find("k") > -1:
question_user_karma = str(float(question_user_karma.replace("k", "")) * 1000)
reputation.append(cleanString(question_user_karma.strip()))
else:
reputation.append('-1')
question_content = question.find("div", {"class": "qa-q-view-content qa-post-content"}).text question_content = question.find("div", {"class": "qa-q-view-content qa-post-content"}).text
post.append(cleanString(question_content.strip())) post.append(cleanString(question_content.strip()))
@ -88,14 +96,22 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup):
post_data = replies.find("div", {"class": "qa-a-item-content qa-post-content"}).find("div",{"itemprop":"text"}).text post_data = replies.find("div", {"class": "qa-a-item-content qa-post-content"}).find("div",{"itemprop":"text"}).text
post.append(cleanString(post_data.strip())) post.append(cleanString(post_data.strip()))
user_reputations = replies.find("span", {"class", "qa-a-item-who-title"}).text
status.append(cleanString(user_reputations.strip()))
user_reputations = replies.find("span", {"class", "qa-a-item-who-title"})
if user_reputations is not None:
user_reputations = user_reputations.text
status.append(cleanString(user_reputations.strip()))
else:
status.append('-1')
karma = replies.find("span", {"class": "qa-a-item-who-points-data"}).text
# Convert karma to pure numerical string
if karma.find("k") > -1:
karma = str(float(karma.replace("k", "")) * 1000)
reputation.append(cleanString(karma.strip()))
karma = replies.find("span", {"class": "qa-a-item-who-points-data"})
if karma is not None:
karma = karma.text
# Convert karma to pure numerical string
if karma.find("k") > -1:
karma = str(float(karma.replace("k", "")) * 1000)
reputation.append(cleanString(karma.strip()))
else:
reputation.append('-1')
feedback.append("-1") feedback.append("-1")
sign.append("-1") sign.append("-1")
@ -139,8 +155,9 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup):
image_user = [] # 8 all user avatars used in each topic image_user = [] # 8 all user avatars used in each topic
# Finding the board # Finding the board
literature = soup.find("div", {"class": "qa-main-heading"}).find("h1")
board = literature.text
board = soup.find("div", {"class": "qa-main-heading"}).find("h1").text
board = board.replace('Recent questions in', '')
board = cleanString(board.strip())
queries_by_user: ResultSet[Tag] = soup.find("div", {"class": "qa-q-list"}).find_all("div", {"class": "qa-q-list-item"}) queries_by_user: ResultSet[Tag] = soup.find("div", {"class": "qa-q-list"}).find_all("div", {"class": "qa-q-list-item"})
@ -148,9 +165,9 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup):
topic_of_query = queries.find("div", {"class": "qa-q-item-title"}).find("a").text topic_of_query = queries.find("div", {"class": "qa-q-item-title"}).find("a").text
topic.append(cleanString(topic_of_query.strip())) topic.append(cleanString(topic_of_query.strip()))
image_user.append("-1")
image_user.append("-1") # qa-q-item-where
author = queries.find("span", {"class": "qa-q-item-who-data"}).find("a").text
author = queries.find("span", {"class": "qa-q-item-who-data"}).text
user.append(cleanString(author.strip())) user.append(cleanString(author.strip()))
num_answers = queries.find("span", {"class": "qa-a-count-data"}).text num_answers = queries.find("span", {"class": "qa-a-count-data"}).text


+ 1
- 1
Forums/Initialization/forums_mining.py View File

@ -102,7 +102,7 @@ def opentor():
# main method # main method
if __name__ == '__main__': if __name__ == '__main__':
# opentor()
opentor()
# assignment from forumsList.txt # assignment from forumsList.txt
forumsList = getForums() forumsList = getForums()


+ 4
- 1
Forums/Initialization/prepare_parser.py View File

@ -212,7 +212,7 @@ def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descript
def move_file(filePath, createLog, logFile): def move_file(filePath, createLog, logFile):
source = filePath source = filePath
destination = filePath.replace(os.path.basename(filePath), "") + r'Read/'
destination = filePath.replace(os.path.basename(filePath), "") + r'Read/' + os.path.basename(filePath)
try: try:
shutil.move(source, destination, shutil.copy2) shutil.move(source, destination, shutil.copy2)
@ -238,6 +238,9 @@ def new_parse(forum, url, createLog):
from Forums.Initialization.forums_mining import config, CURRENT_DATE from Forums.Initialization.forums_mining import config, CURRENT_DATE
global nError
nError = 0
print("Parsing the " + forum + " forum and conduct data classification to store the information in the database.") print("Parsing the " + forum + " forum and conduct data classification to store the information in the database.")
# Connecting to the database # Connecting to the database


+ 1
- 1
Forums/Libre/crawler_selenium.py View File

@ -239,7 +239,7 @@ def crawlForum(driver):
except NoSuchElementException: except NoSuchElementException:
has_next_topic_page = False has_next_topic_page = False
for i in range(counter):
for j in range(counter):
driver.back() driver.back()
# comment out # comment out


+ 1
- 1
Forums/OnniForums/crawler_selenium.py View File

@ -250,7 +250,7 @@ def crawlForum(driver):
except NoSuchElementException: except NoSuchElementException:
has_next_topic_page = False has_next_topic_page = False
for i in range(counter):
for j in range(counter):
driver.back() driver.back()
# comment out # comment out


+ 1
- 1
Forums/Procrax/crawler_selenium.py View File

@ -241,7 +241,7 @@ def crawlForum(driver):
except NoSuchElementException: except NoSuchElementException:
has_next_topic_page = False has_next_topic_page = False
for i in range(counter):
for j in range(counter):
driver.back() driver.back()
# comment out # comment out


MarketPlaces/Classifier/test_classify.py → MarketPlaces/Classifier/classify_test.py View File


+ 3
- 0
MarketPlaces/Initialization/prepare_parser.py View File

@ -295,6 +295,9 @@ def new_parse(marketPlace, url, createLog):
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
global nError
nError = 0
print("Parsing the " + marketPlace + " market and conduct data classification to store the information in the database.") print("Parsing the " + marketPlace + " market and conduct data classification to store the information in the database.")
# Connecting to the database # Connecting to the database


Loading…
Cancel
Save