<h2>LA BRIQUE</h2>
<h3>A Web2Print publishing project on the theme of Lost Media</h3>

<h4>A script to collect all the sentences containing the word "project"</h4>
<syntaxhighlight lang="python">
import re
import json

# load the scraped articles and flatten them into a single string
with open('output.json', 'r') as f:
    data = str(json.load(f))

# match every sentence that contains the word "project"
regex = r"\b[^.!?]+project[^a-z][^.!?]+[.!?]+"
# strip footnote markers ("[1]") and escaped newlines left over from the JSON dump
regex2 = r"\[?\d\]|\\?n\\n|\\n|\\|\[\d"

sentences = re.findall(regex, data)

with open('project.txt', 'w', encoding='utf-8') as file:
    for sentence in sentences:
        sentence = re.sub(regex2, "", sentence)
        file.write(f'{sentence}\n')
</syntaxhighlight>
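A quick way to see what the pattern actually matches is to run it on a small string; the sample sentences below are invented for the example, and only the one containing "project" is returned:

<syntaxhighlight lang="python">
import re

regex = r"\b[^.!?]+project[^a-z][^.!?]+[.!?]+"
sample = "The film is lost. The project was cancelled in 1998. Nothing else survives."

print(re.findall(regex, sample))
# ['The project was cancelled in 1998.']
</syntaxhighlight>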
<h4>A script to count each word</h4>
<syntaxhighlight lang="python">
import string
import json

# load the scraped articles and flatten them into a single string
data = json.load(open('output.json', 'r'))
texte = ''.join(str(e) for e in data)

words = {}
# common English words to leave out of the count
stopwords = ["the", "in", "a", "at", "for", "it", "of", "is", "by", "to",
             "and", "was", "on", "be", "this", "an", "but", "that", "as", "he"]

texte = texte.lower()
# strip all punctuation before splitting into words
texte = texte.translate(str.maketrans('', '', string.punctuation))
txt = texte.split()

for i in txt:
    if i in stopwords:
        continue
    if i not in words:
        words[i] = 1
    else:
        words[i] = words[i] + 1

# sort by frequency, most frequent word first
newWords = dict(sorted(words.items(), key=lambda item: item[1], reverse=True))

# utf-8 avoids the encoding errors the original bare try/except was silencing
with open('file.txt', 'w', encoding='utf-8') as file:
    for a in newWords:
        file.write(f'{a} : {newWords[a]}\n')
</syntaxhighlight>
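The same count can be written more compactly with collections.Counter from the standard library; a minimal sketch of that alternative, reading the same output.json:

<syntaxhighlight lang="python">
from collections import Counter
import string
import json

with open('output.json', 'r') as f:
    texte = ''.join(str(e) for e in json.load(f)).lower()
texte = texte.translate(str.maketrans('', '', string.punctuation))

stopwords = {"the", "in", "a", "at", "for", "it", "of", "is", "by", "to",
             "and", "was", "on", "be", "this", "an", "but", "that", "as", "he"}

# Counter tallies each word; most_common() sorts by frequency
counts = Counter(w for w in texte.split() if w not in stopwords)
for word, n in counts.most_common(20):
    print(f'{word} : {n}')
</syntaxhighlight>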
<h4>The script to scrape the content from the wiki</h4>
<syntaxhighlight lang="python">
# the library used to connect to the site
import mechanize
# the library used to parse the html structure
import lxml.html
# the library used to select a piece of information in the html structure
import cssselect
import ssl
import json


def getSrc(url):
    # 1. download the html source of the page at url
    browser = mechanize.Browser()
    browser.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    browser.set_handle_robots(False)
    # work around https certificate verification
    browser.set_ca_data(context=ssl._create_unverified_context(cert_reqs=ssl.CERT_NONE))
    data = browser.open(url, timeout=10)

    # 2. parse the source and turn it into a navigable html structure
    source = data.read()
    html = lxml.html.fromstring(source)
    return html


def getHtmlElements(html, selecteurCss):
    # 3. target the information we are interested in
    # convert the css selector into a cssselect object
    selecteurs = cssselect.parse(selecteurCss)
    selecteur = selecteurs[0]
    # translate the selector into an xpath expression and run it
    chemin_xpath = cssselect.HTMLTranslator().selector_to_xpath(selecteur)
    resultats = html.xpath(chemin_xpath)
    return resultats


def extractText(textTag):
    # keep the section headlines and the body paragraphs of the article
    elements = textTag.xpath(".//h2/span[@class='mw-headline']|.//div[@class='mw-parser-output']/p")
    article = ''
    for element in elements:
        article = article + element.text_content() + '\n'
    return article


siteUrl = 'https://www.lostmediawiki.com'

articlesLinks = []
nextUrl = 'https://www.lostmediawiki.com/index.php?title=Category:Completely_lost_media'

# follow the category's "next page" links until there are none left
while nextUrl:
    print('###### NEXT PAGE! ############')
    print('# GOING TO ' + nextUrl)

    src = getSrc(nextUrl)
    nextUrl = False

    # collect the link of every article listed on this category page
    articlesLinksTags = getHtmlElements(src, ".mw-category-group a")
    for articleLinkTag in articlesLinksTags:
        articlesLinks.append(siteUrl + articleLinkTag.get('href'))

    # look for the "next page" pagination link
    nextPrevLinksTags = getHtmlElements(src, '#mw-pages > a')
    for nextPrevLinkTag in nextPrevLinksTags:
        print(nextPrevLinkTag.text_content())
        if nextPrevLinkTag.text_content() == 'next page':
            nextUrl = siteUrl + nextPrevLinkTag.get('href')
            break

    # stop after the first category page while testing; remove to crawl them all
    break

print(articlesLinks)

# target structure:
# [
#   {'title': '', 'text': ''},
#   {'title': '', 'text': ''}
# ]
articles = []
counter = 0
for articleLink in articlesLinks:
    article = {'title': '', 'text': ''}

    src = getSrc(articleLink)

    titleTags = getHtmlElements(src, 'h1#firstHeading')
    try:
        article['title'] = titleTags[0].text_content()
    except IndexError:
        print("error: no title found")

    textTag = getHtmlElements(src, '.mw-parser-output')
    article['text'] = extractText(textTag[0])

    articles.append(article)

    counter += 1
    # only scrape the first 4 articles while testing; remove to scrape them all
    if counter == 4:
        break

print(articles)

articlesJSON = json.dumps(articles)
with open('output.json', 'w') as f:
    f.write(articlesJSON)
</syntaxhighlight>
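The heart of getHtmlElements is cssselect's translation of a CSS selector into an XPath expression that lxml can run. A minimal sketch on an invented HTML snippet (the markup is made up for the example; the selector is the one used above for the category pages):

<syntaxhighlight lang="python">
import cssselect
import lxml.html

# invented snippet mimicking a category listing
html = lxml.html.fromstring('<div class="mw-category-group"><a href="/wiki/A">A</a></div>')

selecteur = cssselect.parse('.mw-category-group a')[0]
chemin_xpath = cssselect.HTMLTranslator().selector_to_xpath(selecteur)
print(chemin_xpath)  # the equivalent XPath expression

for el in html.xpath(chemin_xpath):
    print(el.get('href'), el.text_content())
# /wiki/A A
</syntaxhighlight>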