<h2>LA BRIQUE</h2>
<h3>A Web2Print publishing project on the theme of Lost Media</h3>

<h4>A script to collect all the sentences containing the word "project"</h4>
<syntaxhighlight lang="python">
import re
import json

# load the scraped articles and flatten them into a single string
with open('output.json', 'r') as f:
    data = str(json.load(f))

# match every sentence that contains the word "project"
regex = r"\b[^.!?]+project[^a-z][^.!?]+[.!?]+"
# strip footnote markers ("[1]") and escaped newlines left over from the JSON dump
regex2 = r"\[?\d\]|\\?n\\n|\\n|\\|\[\d"

sentences = re.findall(regex, data)

with open('project.txt', 'w', encoding='utf-8') as file:
    for sentence in sentences:
        sentence = re.sub(regex2, "", sentence)
        file.write(f'{sentence}\n')
</syntaxhighlight>
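A quick way to see what the pattern actually matches is to run it on a small string; the sample sentences below are invented for the example, and only the one containing "project" is returned:

<syntaxhighlight lang="python">
import re

regex = r"\b[^.!?]+project[^a-z][^.!?]+[.!?]+"
sample = "The film is lost. The project was cancelled in 1998. Nothing else survives."

print(re.findall(regex, sample))
# ['The project was cancelled in 1998.']
</syntaxhighlight>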
<h4>A script to count each word</h4>
<syntaxhighlight lang="python">
import string
import json

# load the scraped articles and flatten them into a single string
data = json.load(open('output.json', 'r'))
texte = ''.join(str(e) for e in data)

words = {}
# common English words to leave out of the count
stopwords = ["the", "in", "a", "at", "for", "it", "of", "is", "by", "to",
             "and", "was", "on", "be", "this", "an", "but", "that", "as", "he"]

texte = texte.lower()
# strip all punctuation before splitting into words
texte = texte.translate(str.maketrans('', '', string.punctuation))
txt = texte.split()

for i in txt:
    if i in stopwords:
        continue
    if i not in words:
        words[i] = 1
    else:
        words[i] = words[i] + 1

# sort by frequency, most frequent word first
newWords = dict(sorted(words.items(), key=lambda item: item[1], reverse=True))

# utf-8 avoids the encoding errors the original bare try/except was silencing
with open('file.txt', 'w', encoding='utf-8') as file:
    for a in newWords:
        file.write(f'{a} : {newWords[a]}\n')
</syntaxhighlight>
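The same count can be written more compactly with collections.Counter from the standard library; a minimal sketch of that alternative, reading the same output.json:

<syntaxhighlight lang="python">
from collections import Counter
import string
import json

with open('output.json', 'r') as f:
    texte = ''.join(str(e) for e in json.load(f)).lower()
texte = texte.translate(str.maketrans('', '', string.punctuation))

stopwords = {"the", "in", "a", "at", "for", "it", "of", "is", "by", "to",
             "and", "was", "on", "be", "this", "an", "but", "that", "as", "he"}

# Counter tallies each word; most_common() sorts by frequency
counts = Counter(w for w in texte.split() if w not in stopwords)
for word, n in counts.most_common(20):
    print(f'{word} : {n}')
</syntaxhighlight>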
<h4>The script to scrape the content from the wiki</h4>
<syntaxhighlight lang="python">
# the library used to connect to the site
import mechanize
# the library used to parse the html structure
import lxml.html
# the library used to select a piece of information in the html structure
import cssselect
import ssl
import json


def getSrc(url):
    # 1. download the html source of the page at url
    browser = mechanize.Browser()
    browser.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    browser.set_handle_robots(False)
    # work around https certificate verification
    browser.set_ca_data(context=ssl._create_unverified_context(cert_reqs=ssl.CERT_NONE))
    data = browser.open(url, timeout=10)

    # 2. parse the source and turn it into a navigable html structure
    source = data.read()
    html = lxml.html.fromstring(source)
    return html


def getHtmlElements(html, selecteurCss):
    # 3. target the information we are interested in
    # convert the css selector into a cssselect object
    selecteurs = cssselect.parse(selecteurCss)
    selecteur = selecteurs[0]
    # translate the selector into an xpath expression and run it
    chemin_xpath = cssselect.HTMLTranslator().selector_to_xpath(selecteur)
    resultats = html.xpath(chemin_xpath)
    return resultats


def extractText(textTag):
    # keep the section headlines and the body paragraphs of the article
    elements = textTag.xpath(".//h2/span[@class='mw-headline']|.//div[@class='mw-parser-output']/p")
    article = ''
    for element in elements:
        article = article + element.text_content() + '\n'
    return article


siteUrl = 'https://www.lostmediawiki.com'

articlesLinks = []
nextUrl = 'https://www.lostmediawiki.com/index.php?title=Category:Completely_lost_media'

# follow the category's "next page" links until there are none left
while nextUrl:
    print('###### NEXT PAGE! ############')
    print('# GOING TO ' + nextUrl)

    src = getSrc(nextUrl)
    nextUrl = False

    # collect the link of every article listed on this category page
    articlesLinksTags = getHtmlElements(src, ".mw-category-group a")
    for articleLinkTag in articlesLinksTags:
        articlesLinks.append(siteUrl + articleLinkTag.get('href'))

    # look for the "next page" pagination link
    nextPrevLinksTags = getHtmlElements(src, '#mw-pages > a')
    for nextPrevLinkTag in nextPrevLinksTags:
        print(nextPrevLinkTag.text_content())
        if nextPrevLinkTag.text_content() == 'next page':
            nextUrl = siteUrl + nextPrevLinkTag.get('href')
            break

    # stop after the first category page while testing; remove to crawl them all
    break

print(articlesLinks)

# target structure:
# [
#   {'title': '', 'text': ''},
#   {'title': '', 'text': ''}
# ]
articles = []
counter = 0
for articleLink in articlesLinks:
    article = {'title': '', 'text': ''}

    src = getSrc(articleLink)

    titleTags = getHtmlElements(src, 'h1#firstHeading')
    try:
        article['title'] = titleTags[0].text_content()
    except IndexError:
        print("error: no title found")

    textTag = getHtmlElements(src, '.mw-parser-output')
    article['text'] = extractText(textTag[0])

    articles.append(article)

    counter += 1
    # only scrape the first 4 articles while testing; remove to scrape them all
    if counter == 4:
        break

print(articles)

articlesJSON = json.dumps(articles)
with open('output.json', 'w') as f:
    f.write(articlesJSON)
</syntaxhighlight>
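The heart of getHtmlElements is cssselect's translation of a CSS selector into an XPath expression that lxml can run. A minimal sketch on an invented HTML snippet (the markup is made up for the example; the selector is the one used above for the category pages):

<syntaxhighlight lang="python">
import cssselect
import lxml.html

# invented snippet mimicking a category listing
html = lxml.html.fromstring('<div class="mw-category-group"><a href="/wiki/A">A</a></div>')

selecteur = cssselect.parse('.mw-category-group a')[0]
chemin_xpath = cssselect.HTMLTranslator().selector_to_xpath(selecteur)
print(chemin_xpath)  # the equivalent XPath expression

for el in html.xpath(chemin_xpath):
    print(el.get('href'), el.text_content())
# /wiki/A A
</syntaxhighlight>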