Skip to content Skip to sidebar Skip to footer

It's All About The Logic: Find All Posts & The Corresponding Threads - On vBulletin

vBulletin Main Goal: at the end we have all the threads (and discussions) where our demo user is involved. (Note: this means we should keep in mind a nice presentation of the gathered data.)

Solution 1:

See if this gets you started.

You can make some functions that will pull out the thread IDs using the post IDs, then iterate through the thread pages and parse the data. I'm not really going to spend too much more time on this. You could possibly use the comments in the HTML as well to pull out some of the sections, but I think this is more or less the thought process you are looking for.

import pandas as pd
import requests
from bs4 import BeautifulSoup
import re


# Browser-like User-Agent so the forum does not reject the automated requests.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'}
userId = 4793


def get_user_stats(userId):
    """Scrape a member's profile page and return their statistics.

    Args:
        userId: vBulletin member id (int or str).

    Returns:
        dict mapping stat name (whitespace and commas removed) to a float,
        built from the "statistics_group" fieldset on the member page.
    """
    url = 'https://forums.sagetv.com/forums/member.php'
    payload = {'u': f'{userId}'}

    response = requests.get(url, headers=headers, params=payload)
    soup = BeautifulSoup(response.text, 'html.parser')

    stats_data = {}
    stats = soup.find('fieldset', {'class': "statistics_group"}).find_all('li')
    for each in stats:
        # Each <li> looks like "Total Posts: 1,234" — drop thousands commas
        # so the value parses as a float.
        values = each.text.replace(',', '').split(':')
        if len(values) == 2:
            key, value = ''.join(values[0].split()), float(values[1])
            stats_data[key] = value
    return stats_data


def get_searchId(userId):
    """Start a 'find all posts by user' search and return its search id.

    The search id is pulled from the pagination control ("Page 1 of N") on
    the first results page; it is required to page through the results.

    Args:
        userId: vBulletin member id (int or str).

    Returns:
        str: the `searchid` query value for subsequent search.php requests.
    """
    # The user id goes in params — the original hard-coded ?do=finduser&u=4793
    # in the URL, which duplicated the query string and ignored `userId`.
    url = 'https://forums.sagetv.com/forums/search.php'
    payload = {'do': 'finduser',
               'u': f'{userId}'}

    response = requests.get(url, headers=headers, params=payload)
    soup = BeautifulSoup(response.text, 'html.parser')
    searchId = soup.find('td', {'class': 'vbmenu_control'}, text=re.compile("^Page 1 of")).find_next('a')['href'].split('searchid=')[-1].split('&')[0]
    return searchId

def get_page_threadIds(threadId_list, soup):
    """Collect thread ids from every post table on one search-results page.

    Mutates `threadId_list` in place (appending ids not already present)
    and returns the same list.

    Args:
        threadId_list: accumulator list of thread id strings.
        soup: BeautifulSoup of a search-results page.

    Returns:
        The (mutated) threadId_list.
    """
    postIds = soup.find_all('table', {'id': re.compile("^post")})
    for each in postIds:
        # Reset per table — otherwise a table with no 't=' link would either
        # raise NameError (first iteration) or reuse the previous table's id.
        threadId = None
        a_s = each.find_all('a')
        for alpha in a_s:
            if 't=' in alpha['href']:
                threadId = alpha['href'].split('t=')[-1]

        if threadId is not None and threadId not in threadId_list:
            threadId_list.append(threadId)
    return threadId_list


def get_all_threadIds(searchId):
    """Walk every page of a search result and return the unique thread ids.

    Args:
        searchId: the `searchid` value returned by get_searchId().

    Returns:
        list of unique thread id strings (order not guaranteed).
    """
    threadId_list = []
    url = 'https://forums.sagetv.com/forums/search.php'
    payload = {'searchid': '%s' % searchId,
               'pp': '200'}  # 200 results per page keeps the page count low

    response = requests.get(url, headers=headers, params=payload)
    soup = BeautifulSoup(response.text, 'html.parser')
    # "Page 1 of N" -> N
    total_pages = int(soup.find('td', {'class': 'vbmenu_control'}, text=re.compile("^Page 1 of")).text.split('of ')[-1])

    threadId_list = get_page_threadIds(threadId_list, soup)
    for page in range(2, total_pages + 1):
        payload.update({'page': '%s' % page})
        response = requests.get(url, headers=headers, params=payload)
        soup = BeautifulSoup(response.text, 'html.parser')
        # get_page_threadIds mutates threadId_list in place and returns the
        # same list — rebind (not +=) so ids are not duplicated each page.
        threadId_list = get_page_threadIds(threadId_list, soup)
    return list(set(threadId_list))
        
        
        
stats = get_user_stats(userId)
searchId = get_searchId(userId)
threadId_list = get_all_threadIds(searchId)


# Visit every thread the user participated in and collect one row per post.
rows = []
for threadId in threadId_list:
    url = 'https://forums.sagetv.com/forums/showthread.php'
    payload = {'t': '%s' % threadId,
               'pp': '40',   # posts per page
               'page': '1'}

    response = requests.get(url, headers=headers, params=payload)
    soup = BeautifulSoup(response.text, 'html.parser')
    try:
        # "Page 1 of N" -> N
        total_pages = int(soup.find('td', {'class': 'vbmenu_control'}, text=re.compile("^Page 1 of")).text.split('of ')[-1])
    except AttributeError:
        # Single-page threads have no pagination control; .find() returns None.
        total_pages = 1

    for page in range(1, total_pages + 1):
        payload.update({'page': '%s' % page})
        response = requests.get(url, headers=headers, params=payload)
        soup = BeautifulSoup(response.text, 'html.parser')
        discussion = soup.find('td', {'class': 'navbar'}).text.strip()
        posts = soup.find_all('table', {'id': re.compile("^post")})
        for post in posts:
            # Header cell holds "<post number> <date string>".
            dateStr = post.find('td', {'class': 'thead'}).text.split()
            postNo = dateStr[0]
            dateStr = ' '.join(dateStr[1:])

            postername = post.find('a', {'class': 'bigusername'}).text
            joinDate = post.find('div', text=re.compile("^Join Date:")).text.split('Join Date:')[-1].strip()
            try:
                location = post.find('div', text=re.compile("^Location:")).text.split('Location:')[-1].strip()
            except AttributeError:
                location = 'N/A'  # not every profile lists a location
            postNum = post.find('div', text=re.compile(".*Posts:")).text.split('Posts:')[-1].replace(',', '').strip()
            message = post.find('div', {'id': re.compile("^post_message_")}).text.strip()

            row = {'date': dateStr,
                   'postNumber': postNo,
                   'poster': postername,
                   'joinDate': joinDate,
                   'location': location,
                   'number of posts': postNum,
                   'thread': discussion,
                   'thread id': threadId,
                   'message': message}
            rows.append(row)

        print('Collected: %s - Page %0s of %s' % (discussion, page, total_pages))


df = pd.DataFrame(rows)


print(stats)
print(df)

Output:

enter image description here

Post a Comment for "Its All About The Logic: Findall Posts & The Corresponding Threads - On Vbulletin"