Mathias
De Design numérique
LA BRIQUE
Un projet d'édition en Web2Print sur le thème des Lost Média
Un script pour récupérer toutes les phrases contenant le mot « project » (le wiki est en anglais)
import re
import json

# A "sentence" is a run of non-terminator characters ending in . ! or ?
# that contains "project" followed by a non-lowercase character.
SENTENCE_RE = r"\b[^.!?]+project[^a-z][^.!?]+[.!?]+"
# Noise left over from the flattened JSON dump: citation markers such as
# "[1]" and escaped-newline / stray-backslash artifacts.
NOISE_RE = r"\[?\d\]|\\?n\\n|\\n|\\|\[\d"


def find_project_sentences(text):
    """Return every sentence of *text* that contains the word "project"."""
    return re.findall(SENTENCE_RE, text)


def clean_sentence(sentence):
    """Strip citation markers and escape artifacts from *sentence*."""
    return re.sub(NOISE_RE, "", sentence)


def main():
    """Extract the "project" sentences from output.json into project.txt."""
    # str() flattens the whole JSON structure into one searchable string;
    # the previous version leaked the file handle opened inline.
    with open('output.json', 'r') as src:
        data = str(json.load(src))
    with open('project.txt', 'w', encoding='utf-8') as out:
        for sentence in find_project_sentences(data):
            out.write(f'{clean_sentence(sentence)}\n')


if __name__ == "__main__":
    main()
Un script pour compter chaque mot
import string
import json
from collections import Counter

# Common English words to ignore when counting.  (The original list held
# "in" twice; a set deduplicates and gives O(1) membership tests.)
STOPWORDS = {"the", "in", "a", "at", "for", "it", "of", "is", "by", "to",
             "and", "was", "on", "be", "this", "an", "but", "that", "as", "he"}


def count_words(text):
    """Return {word: count} for *text*, most frequent first, stopwords removed.

    Counting is case-insensitive and ignores all ASCII punctuation.
    """
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    counts = Counter(word for word in text.split() if word not in STOPWORDS)
    # most_common() sorts by descending count with a stable sort, matching
    # the previous sorted(..., key=item count, reverse=True) behaviour.
    return dict(counts.most_common())


def main():
    """Count the words of output.json and write "word : count" lines to file.txt."""
    with open('output.json', 'r') as src:
        data = json.load(src)
    text = ''.join(str(e) for e in data)
    # Writing UTF-8 makes the old silent try/except around write() (which
    # swallowed UnicodeEncodeError and dropped words) unnecessary.
    with open('file.txt', 'w', encoding='utf-8') as out:
        for word, count in count_words(text).items():
            out.write(f'{word} : {count}\n')


if __name__ == "__main__":
    main()
Le script pour récupérer le contenu sur le wiki
#la librairie qui permet de se connecter au site
import mechanize
#la librairie qui permet d'analyser la structure html
import lxml.html
#la librairie qui permet de sélectionner une info dans la structure html
import cssselect
import ssl
import json
def getSrc(url):
    """Download the page at *url* and return it as a navigable lxml HTML tree."""
    browser = mechanize.Browser()
    # Pretend to be a regular desktop browser so the wiki serves the page,
    # and ignore robots.txt.
    browser.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    browser.set_handle_robots(False)
    # Work around HTTPS issues by disabling certificate verification.
    browser.set_ca_data(context=ssl._create_unverified_context(cert_reqs=ssl.CERT_NONE))
    response = browser.open(url, timeout=10)
    # Parse the raw source into a navigable HTML structure.
    return lxml.html.fromstring(response.read())
def getHtmlElements(html, selecteurCss):
    """Return the elements of *html* matching the CSS selector string."""
    # cssselect translates one parsed selector at a time: parse the string,
    # take the first selector, convert it to XPath, and query the tree.
    parsed = cssselect.parse(selecteurCss)
    xpath = cssselect.HTMLTranslator().selector_to_xpath(parsed[0])
    return html.xpath(xpath)
def extractText(textTag):
    """Concatenate the section headlines and paragraphs found under *textTag*.

    Each element's text is followed by a newline.
    """
    nodes = textTag.xpath(".//h2/span[@class='mw-headline']|.//div[@class='mw-parser-output']/p")
    return ''.join(node.text_content() + '\n' for node in nodes)
siteUrl = 'https://www.lostmediawiki.com'
start = True
articlesLinks = []
nextUrl = 'https://www.lostmediawiki.com/index.php?title=Category:Completely_lost_media'
# Walk the category listing, collecting every article link on each page.
while nextUrl != False:
    print('######NEXT PAGE!############')
    print('#GOING TO' + nextUrl)
    src = getSrc(nextUrl)
    nextUrl = False
    # Every article link inside the category listing.
    for linkTag in getHtmlElements(src, ".mw-category-group a"):
        articlesLinks.append(siteUrl + linkTag.get('href'))
    # Find the "next page" pagination link, if any, to continue the walk.
    for pageLinkTag in getHtmlElements(src, '#mw-pages > a'):
        print(pageLinkTag.text_content())
        if pageLinkTag.text_content() == 'next page':
            nextUrl = siteUrl + pageLinkTag.get('href')
            break
    # NOTE(review): this break stops after the first listing page —
    # presumably a debug limiter; remove it to crawl the whole category.
    break
print(articlesLinks)
#[
# {
# 'title':'',
# 'text':''}
# ,
# {
# 'title':'',
# 'text':''}
# ]
# Fetch each article page and collect {'title': ..., 'text': ...} records.
articles = []
counter = 0
for articleLink in articlesLinks:
    article = {'title': '', 'text': ''}
    src = getSrc(articleLink)
    titleTags = getHtmlElements(src, 'h1#firstHeading')
    try:
        article['title'] = titleTags[0].text_content()
    except IndexError:
        # Page without a main heading: keep the empty title, best-effort.
        # (The old bare except also swallowed KeyboardInterrupt and the like.)
        print("erreur")
    textTag = getHtmlElements(src, '.mw-parser-output')
    article['text'] = extractText(textTag[0])
    articles.append(article)
    counter += 1
    # Debug limiter: only scrape the first 4 articles — remove for a full run.
    if counter == 4:
        break
print(articles)
# Persist the scraped articles for the other two scripts to consume.
articlesJSON = json.dumps(articles)
with open('output.json', 'w') as f:
    f.write(articlesJSON)