Mathias
De Design numérique
LA BRIQUE
Un projet d'édition en Web2Print sur le thème des Lost Média
Un script pour récupérer toutes les phrases contenant le mot « project » (le wiki est en anglais)
import re
import json

# A "sentence" is a run of non-terminator characters ending in . ! or ?
# that contains "project" followed by a non-lowercase character.
SENTENCE_RE = r"\b[^.!?]+project[^a-z][^.!?]+[.!?]+"
# Noise left over from the flattened JSON dump: citation markers such as
# "[1]" and escaped-newline / stray-backslash artifacts.
NOISE_RE = r"\[?\d\]|\\?n\\n|\\n|\\|\[\d"


def find_project_sentences(text):
    """Return every sentence of *text* that contains the word "project"."""
    return re.findall(SENTENCE_RE, text)


def clean_sentence(sentence):
    """Strip citation markers and escape artifacts from *sentence*."""
    return re.sub(NOISE_RE, "", sentence)


def main():
    """Extract the "project" sentences from output.json into project.txt."""
    # str() flattens the whole JSON structure into one searchable string;
    # the previous version leaked the file handle opened inline.
    with open('output.json', 'r') as src:
        data = str(json.load(src))
    with open('project.txt', 'w', encoding='utf-8') as out:
        for sentence in find_project_sentences(data):
            out.write(f'{clean_sentence(sentence)}\n')


if __name__ == "__main__":
    main()
Un script pour compter chaque mot
import string
import json
from collections import Counter

# Common English words to ignore when counting.  (The original list held
# "in" twice; a set deduplicates and gives O(1) membership tests.)
STOPWORDS = {"the", "in", "a", "at", "for", "it", "of", "is", "by", "to",
             "and", "was", "on", "be", "this", "an", "but", "that", "as", "he"}


def count_words(text):
    """Return {word: count} for *text*, most frequent first, stopwords removed.

    Counting is case-insensitive and ignores all ASCII punctuation.
    """
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    counts = Counter(word for word in text.split() if word not in STOPWORDS)
    # most_common() sorts by descending count with a stable sort, matching
    # the previous sorted(..., key=item count, reverse=True) behaviour.
    return dict(counts.most_common())


def main():
    """Count the words of output.json and write "word : count" lines to file.txt."""
    with open('output.json', 'r') as src:
        data = json.load(src)
    text = ''.join(str(e) for e in data)
    # Writing UTF-8 makes the old silent try/except around write() (which
    # swallowed UnicodeEncodeError and dropped words) unnecessary.
    with open('file.txt', 'w', encoding='utf-8') as out:
        for word, count in count_words(text).items():
            out.write(f'{word} : {count}\n')


if __name__ == "__main__":
    main()
Le script pour récupérer le contenu sur le wiki
#la librairie qui permet de se connecter au site
import mechanize
#la librairie qui permet d'analyser la structure html
import lxml.html
#la librairie qui permet de sélectionner une info dans la structure html
import cssselect
import ssl
import json
def getSrc(url):
    """Download the page at *url* and return it as a navigable lxml HTML tree."""
    browser = mechanize.Browser()
    # Pretend to be a regular desktop browser so the wiki serves the page,
    # and ignore robots.txt.
    browser.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    browser.set_handle_robots(False)
    # Work around HTTPS issues by disabling certificate verification.
    browser.set_ca_data(context=ssl._create_unverified_context(cert_reqs=ssl.CERT_NONE))
    response = browser.open(url, timeout=10)
    # Parse the raw source into a navigable HTML structure.
    return lxml.html.fromstring(response.read())
def getHtmlElements(html, selecteurCss):
    """Return the elements of *html* matching the CSS selector string."""
    # cssselect translates one parsed selector at a time: parse the string,
    # take the first selector, convert it to XPath, and query the tree.
    parsed = cssselect.parse(selecteurCss)
    xpath = cssselect.HTMLTranslator().selector_to_xpath(parsed[0])
    return html.xpath(xpath)
def extractText(textTag):
    """Concatenate the section headlines and paragraphs found under *textTag*.

    Each element's text is followed by a newline.
    """
    nodes = textTag.xpath(".//h2/span[@class='mw-headline']|.//div[@class='mw-parser-output']/p")
    return ''.join(node.text_content() + '\n' for node in nodes)
siteUrl = 'https://www.lostmediawiki.com'
start = True
articlesLinks = []
nextUrl = 'https://www.lostmediawiki.com/index.php?title=Category:Completely_lost_media'
# Walk the category listing, collecting every article link on each page.
while nextUrl != False:
    print('######NEXT PAGE!############')
    print('#GOING TO' + nextUrl)
    src = getSrc(nextUrl)
    nextUrl = False
    # Every article link inside the category listing.
    for linkTag in getHtmlElements(src, ".mw-category-group a"):
        articlesLinks.append(siteUrl + linkTag.get('href'))
    # Find the "next page" pagination link, if any, to continue the walk.
    for pageLinkTag in getHtmlElements(src, '#mw-pages > a'):
        print(pageLinkTag.text_content())
        if pageLinkTag.text_content() == 'next page':
            nextUrl = siteUrl + pageLinkTag.get('href')
            break
    # NOTE(review): this break stops after the first listing page —
    # presumably a debug limiter; remove it to crawl the whole category.
    break
print(articlesLinks)
#[
# {
# 'title':'',
# 'text':''}
# ,
# {
# 'title':'',
# 'text':''}
# ]
# Fetch each article page and collect {'title': ..., 'text': ...} records.
articles = []
counter = 0
for articleLink in articlesLinks:
    article = {'title': '', 'text': ''}
    src = getSrc(articleLink)
    titleTags = getHtmlElements(src, 'h1#firstHeading')
    try:
        article['title'] = titleTags[0].text_content()
    except IndexError:
        # Page without a main heading: keep the empty title, best-effort.
        # (The old bare except also swallowed KeyboardInterrupt and the like.)
        print("erreur")
    textTag = getHtmlElements(src, '.mw-parser-output')
    article['text'] = extractText(textTag[0])
    articles.append(article)
    counter += 1
    # Debug limiter: only scrape the first 4 articles — remove for a full run.
    if counter == 4:
        break
print(articles)
# Persist the scraped articles for the other two scripts to consume.
articlesJSON = json.dumps(articles)
with open('output.json', 'w') as f:
    f.write(articlesJSON)