diff --git a/.idea/DW_Pipeline_Test.iml b/.idea/DW_Pipeline_Test.iml
index cd99e29..5a5ac36 100644
--- a/.idea/DW_Pipeline_Test.iml
+++ b/.idea/DW_Pipeline_Test.iml
@@ -30,6 +30,7 @@
diff --git a/Forums/AbyssForum/crawler_selenium.py b/Forums/AbyssForum/crawler_selenium.py
index f33b521..071abb0 100644
--- a/Forums/AbyssForum/crawler_selenium.py
+++ b/Forums/AbyssForum/crawler_selenium.py
@@ -225,7 +225,7 @@ def crawlForum(driver):
                             has_next_topic_page = False
 
                     # end of loop
-                    for i in range(counter):
+                    for j in range(counter):
                         driver.back()
 
                     # comment out
diff --git a/Forums/Altenens/crawler_selenium.py b/Forums/Altenens/crawler_selenium.py
index ec149ba..d13847a 100644
--- a/Forums/Altenens/crawler_selenium.py
+++ b/Forums/Altenens/crawler_selenium.py
@@ -173,28 +173,33 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []
 
-    # Hacking Tools
-    links.append('https://altenens.is/forums/hacking-tools.469165/')
-    # hash cracking
-    links.append('https://altenens.is/forums/hash-cracking.469167/')
-    # phishing and spamming
-    links.append('https://altenens.is/forums/phishing-and-spamming.469223/')
-    # pentesting
-    links.append('https://altenens.is/forums/pentesting.469169/')
-    # cracking tools
+    # Hacking
+    links.append('https://altenens.is/forums/hacking.469162/')
+    # Hacking showoff
+    links.append('https://altenens.is/forums/hacking-showoff.469232/')
+    # Remote administration
+    links.append('https://altenens.is/forums/remote-administration.469161/')
+    # Cracking tools
     links.append('https://altenens.is/forums/cracking-tools.469204/')
-    # Cracking Tools
+    # Cracking tutorials
     links.append('https://altenens.is/forums/cracking-tutorials-other-methods.469205/')
+    # Combo lists and configs
+    links.append('https://altenens.is/forums/combolists-and-configs.469206/')
+    # Programming
+    links.append('https://altenens.is/forums/programming.469239/')
 
     return links
 
 
+# newest version of crawling
 def crawlForum(driver):
     print("Crawling the Altenens forum")
 
     linksToCrawl = getInterestedLinks()
 
-    for link in linksToCrawl:
+    i = 0
+    while i < len(linksToCrawl):
+        link = linksToCrawl[i]
         print('Crawling :', link)
         try:
             has_next_page = True
@@ -235,7 +240,7 @@ def crawlForum(driver):
                     except NoSuchElementException:
                         has_next_topic_page = False
 
-                    for i in range(counter):
+                    for j in range(counter):
                         driver.back()
 
                     # comment out
@@ -243,7 +248,7 @@ def crawlForum(driver):
 
                 # comment out
                 if count == 1:
-                        break
+                    break
 
                 try:
                     link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
@@ -256,6 +261,7 @@ def crawlForum(driver):
 
         except Exception as e:
             print(link, e)
+        i += 1
 
     print("Crawling the Altenens forum done.")
 
diff --git a/Forums/BestCardingWorld/crawler_selenium.py b/Forums/BestCardingWorld/crawler_selenium.py
index 487863b..6c3bdc9 100644
--- a/Forums/BestCardingWorld/crawler_selenium.py
+++ b/Forums/BestCardingWorld/crawler_selenium.py
@@ -235,7 +235,7 @@ def crawlForum(driver):
                             has_next_topic_page = False
 
                     # end of loop
-                    for i in range(counter):
+                    for j in range(counter):
                         driver.back()
 
                     # comment out
diff --git a/Forums/Cardingleaks/crawler_selenium.py b/Forums/Cardingleaks/crawler_selenium.py
index caf4a9a..1e89751 100644
--- a/Forums/Cardingleaks/crawler_selenium.py
+++ b/Forums/Cardingleaks/crawler_selenium.py
@@ -181,18 +181,18 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []
 
-    # # carding methods
+    # carding methods
     links.append('https://leaks.ws/forums/carding-methods.82/')
-    # # carding schools
-    # links.append('https://leaks.ws/forums/help-desk-carding-school.35/')
-    # # carding discussion
-    # links.append('https://leaks.ws/forums/carding-discussion-desk.58/')
-    # # carding tutorials
-    # links.append('https://leaks.ws/forums/carding-tutorials.13/')
-    # # carding tools and software
-    # links.append('https://leaks.ws/forums/carding-tools-softwares.10/')
-    # # exploits and cracking tools
-    # links.append('https://leaks.ws/forums/exploits-cracking-tools.22/')
+    # carding schools
+    links.append('https://leaks.ws/forums/help-desk-carding-school.35/')
+    # carding discussion
+    links.append('https://leaks.ws/forums/carding-discussion-desk.58/')
+    # carding tutorials
+    links.append('https://leaks.ws/forums/carding-tutorials.13/')
+    # carding tools and software
+    links.append('https://leaks.ws/forums/carding-tools-softwares.10/')
+    # exploits and cracking tools
+    links.append('https://leaks.ws/forums/exploits-cracking-tools.22/')
 
     return links
@@ -245,11 +245,11 @@ def crawlForum(driver):
                     except NoSuchElementException:
                         has_next_topic_page = False
 
-                    for i in range(counter):
+                    for j in range(counter):
                         driver.back()
 
                     # comment out
-                    # break
+                    break
 
                 # comment out
                 if count == 1:
diff --git a/Forums/Cardingleaks/parser.py b/Forums/Cardingleaks/parser.py
index 022fbe1..f913243 100644
--- a/Forums/Cardingleaks/parser.py
+++ b/Forums/Cardingleaks/parser.py
@@ -109,7 +109,8 @@ def cardingleaks_listing_parser(soup: Tag):
     li = soup.find("h1", {"class": "p-title-value"})
     board = cleanString(li.text.strip())
 
-    thread_list: ResultSet[Tag] = soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True})
+    thread_list = soup.find('div', {"class": "structItemContainer-group structItemContainer-group--sticky"}).find_all('div', {"data-author": True}) + \
+                  soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True})
 
     nm = len(thread_list)
 
@@ -120,10 +121,13 @@ def cardingleaks_listing_parser(soup: Tag):
         thread_topic = thread.find("div", {"class": "structItem-title"}).text
         topic.append(cleanString(thread_topic.strip()))
 
-        author_icon = thread.find("a", {"class": "avatar avatar--s"}).find("img")
-        author_icon = author_icon.get('src')
-        author_icon = author_icon.split('base64,')[-1]
-        image_user.append(author_icon)
+        author_icon = thread.find("a", {"class": "avatar avatar--s"})
+        if author_icon is not None:
+            author_icon = author_icon.find('img').get('src')
+            author_icon = author_icon.split('base64,')[-1]
+            image_user.append(author_icon)
+        else:
+            image_user.append('-1')
 
         thread_view = thread.find("dl", {"class": "pairs pairs--justified structItem-minor"}).find("dd").text
         # Context text view count (i.e., 8.8K) to numerical (i.e., 8800)
diff --git a/Forums/Classifier/test_classify.py b/Forums/Classifier/classify_test.py
similarity index 100%
rename from Forums/Classifier/test_classify.py
rename to Forums/Classifier/classify_test.py
diff --git a/Forums/CryptBB/crawler_selenium.py b/Forums/CryptBB/crawler_selenium.py
index bcef5a8..40255ce 100644
--- a/Forums/CryptBB/crawler_selenium.py
+++ b/Forums/CryptBB/crawler_selenium.py
@@ -199,28 +199,24 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []
 
-    # # Beginner Programming
+    # Beginner Programming
     links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86')
-    # # Beginner Carding and Fraud
-    # links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=91')
-    # # Beginner Hacking
-    # links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=87')
-    # # Newbie
-    # links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=84')
-    # # Beginner Hardware
-    # links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=89')
-    # # Training Challenges
-    # links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=96')
+    # Beginner Carding and Fraud
+    links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=91')
+    # Beginner Hacking
+    links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=87')
+    # Newbie
+    links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=84')
+    # Beginner Hardware
+    links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=89')
+    # Training Challenges
+    links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=96')
     # Darknet Discussions
-    # links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=88')
-    # # Public Leaks and Warez
-    # links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=97')
-    # # Hacked Accounts and Database Dumps
-    # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=30')
-    # # Android Moded pak
-    # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=53')
-    # # Sell
-    # links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=44')
+    links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=88')
+    # Public Leaks and Warez
+    links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=97')
+    # Sell
+    links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=44')
 
     return links
@@ -260,9 +256,9 @@ def crawlForum(driver):
                             driver.refresh()
                         savePage(driver, driver.page_source, topic + f"page{counter}")  # very important
 
-                        # comment out
-                        if counter == 2:
-                            break
+                        # # comment out
+                        # if counter == 2:
+                        #     break
 
                         try:
                             temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div')
@@ -275,15 +271,15 @@ def crawlForum(driver):
                     except NoSuchElementException:
                         has_next_topic_page = False
 
-                    for i in range(counter):
+                    for j in range(counter):
                         driver.back()
 
-                    # comment out
-                    # break
-
-                    # comment out
-                    if count == 1:
-                        break
+                    # # comment out
+                    # break
+                    #
+                    # # comment out
+                    # if count == 1:
+                    #     break
 
                 try:
                     temp = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div')
diff --git a/Forums/CryptBB/parser.py b/Forums/CryptBB/parser.py
index d725a98..60c513b 100644
--- a/Forums/CryptBB/parser.py
+++ b/Forums/CryptBB/parser.py
@@ -40,7 +40,6 @@ def cryptBB_description_parser(soup):
 
     # Finding the repeated tag that corresponds to the listing of posts
 
-    # try:
     posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all(
         'div', {"class": "post"})
 
@@ -48,6 +47,9 @@
     for ipost in posts:
 
+        if ipost.find('div', {"class": "deleted_post_author"}):
+            continue
+
         # Finding a first level of the HTML page
 
         post_wrapper = ipost.find('span', {"class": "largetext"})
 
@@ -61,56 +63,49 @@
         smalltext = ipost.find('div', {"class": "post_author"})
 
-        '''
-        # Testing here two possibilities to find this status and combine them
-        if ipost.find('div', {"class": "deleted_post_author"}):
-            status.append(-1)
-            interest.append(-1)
-            reputation.append(-1)
-            addDate.append(-1)
-            post.append("THIS POST HAS BEEN REMOVED!")
-            sign.append(-1)
-            feedback.append(-1)
-            continue
-        '''
-
-        # CryptBB does have membergroup and postgroup
-        membergroup = smalltext.find('div', {"class": "profile-rank"})
-        postgroup = smalltext.find('div', {"class": "postgroup"})
-        if membergroup != None:
-            membergroup = membergroup.text.strip()
-            if postgroup != None:
-                postgroup = postgroup.text.strip()
-                membergroup = membergroup + " - " + postgroup
-        else:
-            if postgroup != None:
-                membergroup = postgroup.text.strip()
-            else:
-                membergroup = "-1"
-        status.append(cleanString(membergroup))
-
-        # Finding the interest of the author
-        # CryptBB does not have blurb
-        blurb = smalltext.find('li', {"class": "blurb"})
-        if blurb != None:
-            blurb = blurb.text.strip()
-        else:
-            blurb = "-1"
-        interest.append(cleanString(blurb))
-
-        # Finding the reputation of the user
-        # CryptBB does have reputation
-        author_stats = smalltext.find('div', {"class": "author_statistics"})
-        karma = author_stats.find('strong')
-        if karma != None:
-            karma = karma.text
-            karma = karma.replace("Community Rating: ", "")
-            karma = karma.replace("Karma: ", "")
-            karma = karma.strip()
-        else:
-            karma = "-1"
-        reputation.append(cleanString(karma))
+        if smalltext is not None:
+            # CryptBB does have membergroup and postgroup
+            membergroup = smalltext.find('div', {"class": "profile-rank"})
+            postgroup = smalltext.find('div', {"class": "postgroup"})
+            if membergroup != None:
+                membergroup = membergroup.text.strip()
+                if postgroup != None:
+                    postgroup = postgroup.text.strip()
+                    membergroup = membergroup + " - " + postgroup
+            else:
+                if postgroup != None:
+                    membergroup = postgroup.text.strip()
+                else:
+                    membergroup = "-1"
+            status.append(cleanString(membergroup))
+
+            # Finding the interest of the author
+            # CryptBB does not have blurb
+            blurb = smalltext.find('li', {"class": "blurb"})
+            if blurb != None:
+                blurb = blurb.text.strip()
+            else:
+                blurb = "-1"
+            interest.append(cleanString(blurb))
+
+            # Finding the reputation of the user
+            # CryptBB does have reputation
+            author_stats = smalltext.find('div', {"class": "author_statistics"})
+            karma = author_stats.find('strong')
+            if karma != None:
+                karma = karma.text
+                karma = karma.replace("Community Rating: ", "")
+                karma = karma.replace("Karma: ", "")
+                karma = karma.strip()
+            else:
+                karma = "-1"
+            reputation.append(cleanString(karma))
+        else:
+            status.append('-1')
+            interest.append('-1')
+            reputation.append('-1')
 
         # Getting here another good tag to find the post date, post content and users' signature
@@ -120,25 +115,30 @@
         # dt = dt.strip().split()
         dt = dt.strip()
         day=date.today()
-        if "Yesterday" in dt:
+        if "Today" in dt:
+            today = day.strftime('%m-%d-%Y')
+            stime = dt.replace('Today,','').strip()
+            date_time_obj = today + ', '+stime
+            date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
+        elif "Yesterday" in dt:
             yesterday = day - timedelta(days=1)
             yesterday = yesterday.strftime('%m-%d-%Y')
             stime = dt.replace('Yesterday,','').strip()
-            date_time_obj = yesterday+ ', '+stime
+            date_time_obj = yesterday + ', '+stime
             date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
-        elif "hour ago" in dt or "hours ago" in dt:
-            day = day.strftime('%m-%d-%Y')
+        elif "ago" in dt:
             date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title']
             date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p')
         else:
             date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')
-        stime = date_time_obj.strftime('%b %d, %Y')
-        sdate = date_time_obj.strftime('%I:%M %p')
 
         addDate.append(date_time_obj)
 
         # Finding the post
 
         inner = postarea.find('div', {"class": "post_body scaleimages"})
+        quote = inner.find('blockquote')
+        if quote is not None:
+            quote.decompose()
         inner = inner.text.strip()
         post.append(cleanString(inner))
@@ -210,6 +210,10 @@ def cryptBB_listing_parser(soup):
 
     itopics = soup.find_all('tr',{"class": "inline_row"})
 
+    # Counting how many topics
+
+    nm = len(itopics)
+
     for itopic in itopics:
 
         # For each topic found, the structure to get the rest of the information can be of two types. Testing all of them
@@ -225,10 +229,6 @@ def cryptBB_listing_parser(soup):
 
             image_user.append(-1)
 
-        # Counting how many topics we have found so far
-
-        nm = len(topic)
-
         # Adding the url to the list of urls
         try:
             link = itopic.find('span', {"class": "subject_old"}).find('a').get('href')
@@ -237,19 +237,24 @@ def cryptBB_listing_parser(soup):
         href.append(link)
 
         # Finding the author of the topic
-        ps = itopic.find('div', {"class":"author smalltext"}).find('a').text
+        ps = itopic.find('div', {"class":"author smalltext"}).text
         user = ps.strip()
         author.append(cleanString(user))
 
         # Finding the number of replies
         columns = itopic.findChildren('td',recursive=False)
         replies = columns[3].text
-
-        posts.append(cleanString(replies))
+        if replies == '-':
+            posts.append('-1')
+        else:
+            posts.append(cleanString(replies))
 
         # Finding the number of Views
         tview = columns[4].text
-        views.append(cleanString(tview))
+        if tview == '-':
+            views.append('-1')
+        else:
+            views.append(cleanString(tview))
 
         # If no information about when the topic was added, just assign "-1" to the variable
diff --git a/Forums/HiddenAnswers/crawler_selenium.py b/Forums/HiddenAnswers/crawler_selenium.py
index a7f37ea..f369347 100644
--- a/Forums/HiddenAnswers/crawler_selenium.py
+++ b/Forums/HiddenAnswers/crawler_selenium.py
@@ -157,16 +157,20 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []
 
-    # Hacks
-    # links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/hacking')
-
-    # links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/darknet-and-tor')
-
-    # links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/internet')
-
+    # hacking
+    links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/hacking')
+    # darknet and tor
+    links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/darknet-and-tor')
+    # internet
+    links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/internet')
+    # links
     links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/links')
-
-
+    # programming
+    links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/programming')
+    # knowledge and information
+    links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/knowledge-and-information')
+    # other
+    links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/other')
 
     return links
@@ -206,12 +210,12 @@ def crawlForum(driver: webdriver.Firefox):
                             driver.refresh()
                         savePage(driver, driver.page_source, topic + f"page{counter}")  # very important
 
-                        # comment out
-                        if counter == 2:
-                            break
+                        # # comment out
+                        # if counter == 2:
+                        #     break
 
                         try:
-                            page = ""  # no next page so far may have some later on
+                            page = driver.find_element(by=By.CLASS_NAME, value='qa-page-next').get_attribute('href')
                             if page == "":
                                 raise NoSuchElementException
                             counter += 1
@@ -219,15 +223,15 @@
                        except NoSuchElementException:
                            has_next_topic_page = False
 
-                    for i in range(counter):
+                    for j in range(counter):
                         driver.back()
 
-                    # comment out
-                    # break
-
-                    # comment out
-                    if count == 1:
-                        break
+                    # # comment out
+                    # break
+                    #
+                    # # comment out
+                    # if count == 1:
+                    #     break
 
                 try:
                     link = driver.find_element(by=By.CLASS_NAME, value='qa-page-next').get_attribute('href')
@@ -248,14 +252,14 @@ def crawlForum(driver: webdriver.Firefox):
 
 
 # Returns 'True' if the link is Topic link
 def isDescriptionLink(url):
-    if 'index.php' in url and 'questions' not in url:
+    if 'http' not in url:
         return True
     return False
 
 
 # Returns True if the link is a listingPage link
 def isListingLink(url):
-    if 'questions' in url:
+    if 'http' in url:
         return True
     return False
diff --git a/Forums/HiddenAnswers/parser.py b/Forums/HiddenAnswers/parser.py
index 995a7f0..0f2647f 100644
--- a/Forums/HiddenAnswers/parser.py
+++ b/Forums/HiddenAnswers/parser.py
@@ -42,14 +42,22 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup):
         datetime_obj = datetime.strptime(datetime_string, "%Y-%m-%dT%H:%M:%S")
         addDate.append(datetime_obj)
 
-    question_user_status = question.find("span", {"class": "qa-q-view-who-title"}).text
-    status.append(cleanString(question_user_status.strip()))
-
-    question_user_karma = question.find("span", {"class": "qa-q-view-who-points-data"}).text
-    # Convert karma to pure numerical string
-    if question_user_karma.find("k") > -1:
-        question_user_karma = str(float(question_user_karma.replace("k", "")) * 1000)
-    reputation.append(cleanString(question_user_karma.strip()))
+    question_user_status = question.find("span", {"class": "qa-q-view-who-title"})
+    if question_user_status is not None:
+        question_user_status = question_user_status.text
+        status.append(cleanString(question_user_status.strip()))
+    else:
+        status.append('-1')
+
+    question_user_karma = question.find("span", {"class": "qa-q-view-who-points-data"})
+    if question_user_karma is not None:
+        question_user_karma = question_user_karma.text
+        # Convert karma to pure numerical string
+        if question_user_karma.find("k") > -1:
+            question_user_karma = str(float(question_user_karma.replace("k", "")) * 1000)
+        reputation.append(cleanString(question_user_karma.strip()))
+    else:
+        reputation.append('-1')
 
     question_content = question.find("div", {"class": "qa-q-view-content qa-post-content"}).text
     post.append(cleanString(question_content.strip()))
@@ -88,14 +96,22 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup):
         post_data = replies.find("div", {"class": "qa-a-item-content qa-post-content"}).find("div",{"itemprop":"text"}).text
         post.append(cleanString(post_data.strip()))
 
-        user_reputations = replies.find("span", {"class", "qa-a-item-who-title"}).text
-        status.append(cleanString(user_reputations.strip()))
+        user_reputations = replies.find("span", {"class", "qa-a-item-who-title"})
+        if user_reputations is not None:
+            user_reputations = user_reputations.text
+            status.append(cleanString(user_reputations.strip()))
+        else:
+            status.append('-1')
 
-        karma = replies.find("span", {"class": "qa-a-item-who-points-data"}).text
-        # Convert karma to pure numerical string
-        if karma.find("k") > -1:
-            karma = str(float(karma.replace("k", "")) * 1000)
-        reputation.append(cleanString(karma.strip()))
+        karma = replies.find("span", {"class": "qa-a-item-who-points-data"})
+        if karma is not None:
+            karma = karma.text
+            # Convert karma to pure numerical string
+            if karma.find("k") > -1:
+                karma = str(float(karma.replace("k", "")) * 1000)
+            reputation.append(cleanString(karma.strip()))
+        else:
+            reputation.append('-1')
 
         feedback.append("-1")
         sign.append("-1")
@@ -139,8 +155,9 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup):
     image_user = []  # 8 all user avatars used in each topic
 
     # Finding the board
-    literature = soup.find("div", {"class": "qa-main-heading"}).find("h1")
-    board = literature.text
+    board = soup.find("div", {"class": "qa-main-heading"}).find("h1").text
+    board = board.replace('Recent questions in', '')
+    board = cleanString(board.strip())
 
     queries_by_user: ResultSet[Tag] = soup.find("div", {"class": "qa-q-list"}).find_all("div", {"class": "qa-q-list-item"})
 
@@ -148,9 +165,9 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup):
         topic_of_query = queries.find("div", {"class": "qa-q-item-title"}).find("a").text
         topic.append(cleanString(topic_of_query.strip()))
 
-        image_user.append("-1")
+        image_user.append("-1")  # qa-q-item-where
 
-        author = queries.find("span", {"class": "qa-q-item-who-data"}).find("a").text
+        author = queries.find("span", {"class": "qa-q-item-who-data"}).text
         user.append(cleanString(author.strip()))
 
         num_answers = queries.find("span", {"class": "qa-a-count-data"}).text
diff --git a/Forums/Initialization/forums_mining.py b/Forums/Initialization/forums_mining.py
index 311ac6c..4d68840 100644
--- a/Forums/Initialization/forums_mining.py
+++ b/Forums/Initialization/forums_mining.py
@@ -102,7 +102,7 @@ def opentor():
 # main method
 if __name__ == '__main__':
 
-    # opentor()
+    opentor()
 
     # assignment from forumsList.txt
     forumsList = getForums()
diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py
index 267f887..ac1523f 100644
--- a/Forums/Initialization/prepare_parser.py
+++ b/Forums/Initialization/prepare_parser.py
@@ -212,7 +212,7 @@ def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descript
 def move_file(filePath, createLog, logFile):
 
     source = filePath
-    destination = filePath.replace(os.path.basename(filePath), "") + r'Read/'
+    destination = filePath.replace(os.path.basename(filePath), "") + r'Read/' + os.path.basename(filePath)
 
     try:
         shutil.move(source, destination, shutil.copy2)
@@ -238,6 +238,9 @@ def new_parse(forum, url, createLog):
 
     from Forums.Initialization.forums_mining import config, CURRENT_DATE
 
+    global nError
+    nError = 0
+
     print("Parsing the " + forum + " forum and conduct data classification to store the information in the database.")
 
     # Connecting to the database
diff --git a/Forums/Libre/crawler_selenium.py b/Forums/Libre/crawler_selenium.py
index 98b5517..58274ec 100644
--- a/Forums/Libre/crawler_selenium.py
+++ b/Forums/Libre/crawler_selenium.py
@@ -239,7 +239,7 @@ def crawlForum(driver):
                     except NoSuchElementException:
                         has_next_topic_page = False
 
-                    for i in range(counter):
+                    for j in range(counter):
                         driver.back()
 
                     # comment out
diff --git a/Forums/OnniForums/crawler_selenium.py b/Forums/OnniForums/crawler_selenium.py
index 806f869..03f2367 100644
--- a/Forums/OnniForums/crawler_selenium.py
+++ b/Forums/OnniForums/crawler_selenium.py
@@ -250,7 +250,7 @@ def crawlForum(driver):
                     except NoSuchElementException:
                         has_next_topic_page = False
 
-                    for i in range(counter):
+                    for j in range(counter):
                         driver.back()
 
                     # comment out
diff --git a/Forums/Procrax/crawler_selenium.py b/Forums/Procrax/crawler_selenium.py
index 7115d6c..7c15483 100644
--- a/Forums/Procrax/crawler_selenium.py
+++ b/Forums/Procrax/crawler_selenium.py
@@ -241,7 +241,7 @@ def crawlForum(driver):
                     except NoSuchElementException:
                         has_next_topic_page = False
 
-                    for i in range(counter):
+                    for j in range(counter):
                         driver.back()
 
                     # comment out
diff --git a/MarketPlaces/Classifier/test_classify.py b/MarketPlaces/Classifier/classify_test.py
similarity index 100%
rename from MarketPlaces/Classifier/test_classify.py
rename to MarketPlaces/Classifier/classify_test.py
diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py
index c5af58b..b94723f 100644
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -295,6 +295,9 @@ def new_parse(marketPlace, url, createLog):
 
     from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
 
+    global nError
+    nError = 0
+
     print("Parsing the " + marketPlace + " market and conduct data classification to store the information in the database.")
 
     # Connecting to the database