grainParisArt/modules/scraping.py

153 lines
7.2 KiB
Python
Raw Normal View History

2024-09-16 11:29:02 +00:00
from bs4 import BeautifulSoup
import requests
import requests_cache
from datetime import timedelta
from modules.firebase import enregistrementFilm, recupererDataFilm
requests_cache.install_cache('film_cache', expire_after=timedelta(minutes=5))
# Récolte les données
def scrap_infoFilm(url, cinema):
films = []
response = requests.get(url)
reponse_text = response.text
soupReponse = BeautifulSoup(reponse_text, 'html.parser')
# films_list = soupReponse.find('div', class_="showtimes-list-holder").find_all('div', class_="card entity-card entity-card-list movie-card-theater cf hred")
films_list_container = soupReponse.find('div', class_="showtimes-list-holder")
if films_list_container:
films_list = films_list_container.find_all('div', class_="card entity-card entity-card-list movie-card-theater cf hred")
for film in films_list:
titre = film.find("div", class_="meta").find('h2', class_="meta-title").find("a").get_text()
realisateur_section = film.find("div", class_="meta-body-item meta-body-direction")
if realisateur_section:
realisateur = realisateur_section.find('span', class_="dark-grey-link").get_text()
else:
realisateur = "Réalisateur non trouvé"
dataFilm_firebase = recupererDataFilm(titre, realisateur)
if dataFilm_firebase == 0:
# Extraction de l'image
thumbnail_img = film.find('img', class_='thumbnail-img')
if thumbnail_img and not thumbnail_img['src'].startswith('data:image'):
img_url = thumbnail_img['src']
else:
urlAffiche = "https://www.allocine.fr" + film.find("div", class_="meta").find('h2', class_="meta-title").find("a")['href']
responseAffiche = requests.get(urlAffiche)
pageFilm = BeautifulSoup(responseAffiche.text, 'html.parser')
thumbnail_img = pageFilm.find('img', class_='thumbnail-img')
img_url = thumbnail_img['src'] if thumbnail_img and not thumbnail_img['src'].startswith('data:image') else 'Image de la vignette non trouvée'
synopsis = film.find('div', class_="synopsis").find('div', class_="content-txt").get_text() if film.find('div', class_="synopsis") else "synopsis non trouvé"
acteur_container = film.find("div", class_="meta-body-item meta-body-actor")
acteurs = [acteur.get_text() for acteur in acteur_container.find_all("span", class_="dark-grey-link")] if acteur_container else ["acteurs non trouvés"]
horaire_sections = film.find_all("div", class_="showtimes-hour-block")
horaires = [horaire_section.find('span', class_="showtimes-hour-item-value").get_text() for horaire_section in horaire_sections if horaire_section.find('span', class_="showtimes-hour-item-value")] or ["Horaire non trouvé"]
genre_container = film.find("div", class_="meta-body-item meta-body-info")
genres = [span.get_text().strip() for span in genre_container.find_all("span") if 'class' in span.attrs and not span.attrs['class'][0].startswith('spacer') and 'nationality' not in span.attrs['class']] if genre_container else ["Genre non trouvé"]
if genres: genres.pop(0)
# Récupération de la durée du film
url = "https://api.themoviedb.org/3/search/movie?query=" + titre
headers = {
"accept": "application/json",
"Authorization": "###"
}
response = requests.get(url, headers=headers)
if response.status_code == 200:
data = response.json()
if data['results']:
film_id = data['results'][0]['id']
url = "https://api.themoviedb.org/3/movie/" + str(film_id)
response = requests.get(url, headers=headers)
data = response.json()
duree_film = data['runtime']
heure = duree_film // 60
minute = duree_film % 60
else:
heure = 0
minute = 0
else:
heure = 0
minute = 0
film_data = {
"titre": titre,
"realisateur": realisateur,
"casting": acteurs,
"genres": genres,
"duree": {"heure": heure, "minute": minute},
"affiche": img_url,
"synopsis": synopsis,
"horaires": [
{
"cinema": cinema,
"seances": horaires
}
]
}
enregistrementFilm(film_data)
print(f"{film_data['titre']} : enregistré dans la db")
else:
horaire_sections = film.find_all("div", class_="showtimes-hour-block")
horaires = [horaire_section.find('span', class_="showtimes-hour-item-value").get_text() for horaire_section in horaire_sections if horaire_section.find('span', class_="showtimes-hour-item-value")] or ["Horaire non trouvé"]
film_data = {
"titre": dataFilm_firebase['titre'],
"realisateur": dataFilm_firebase['realisateur'],
"casting": dataFilm_firebase['casting'],
'genres': ['Drame', 'Romance'],
"duree": dataFilm_firebase['duree'],
"affiche": dataFilm_firebase['affiche'],
"synopsis": dataFilm_firebase['synopsis'],
"horaires": [
{
"cinema": cinema,
"seances": horaires
}
]
}
print(f"{film_data['titre']} : récupéré dans la db")
# Ajout du film s'il n'existe pas déjà
existing_film = next((f for f in films if f["titre"] == titre), None)
if existing_film:
existing_film["horaires"].append({
"cinema": cinema,
"seances": horaires
})
else:
films.append(film_data)
else:
print(f"L'élément 'showtimes-list-holder' n'a pas été trouvé pour l'URL {url}")
films_list = []
return films
def get_data(cinemas):
films = []
for cinema in cinemas:
result = scrap_infoFilm(cinema["url"], cinema["salle"])
films.extend(result)
return films
def cleanFilms(films):
filmsClean = []
for film in films:
existing_film = next((f for f in filmsClean if f["titre"] == film["titre"]), None)
if existing_film:
existing_film["horaires"].append({
"cinema": film["horaires"][0]["cinema"],
"seances": film["horaires"][0]["seances"]
})
else:
filmsClean.append(film)
return filmsClean