engkimbs.tistory.com/613

 

[Python 재무제표 크롤링 #4] Requests, BeautifulSoup로 크롤링(Crawling), 데이터 추출하기(Data Extraction) - 1

| Requests, BeautifulSoup 라이브러리 Requests는 웹상의 html문서를 파이썬 언어를 통해 쉽게 사용자 컴퓨터로 가져올 수 있게 하는 라이브러리입니다. 그리고 BeautifulSoup는 가져온 HTML문서를 파싱하여 

engkimbs.tistory.com

- flask 로 프로젝트 만들기 



-터미널에서 설치 

pip install requests
pip install bs4

templates 폴더에 data_gathering.html 만들어놓고

일단 간단하게 만들어놓기 

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    데이터 수집중

</body>
</html>

-app.py

import requests
from bs4 import BeautifulSoup
from flask import Flask, render_template

app = Flask(__name__)

url = 'https://movie.naver.com/movie/running/current.nhn' #네이버영화 주소


@app.route('/')
def data_gathering():
    """Download the page at the module-level ``url`` and print its parsed HTML.

    The pretty-printed document goes to stdout (the server console); the
    response sent to the browser is the ``data_gathering.html`` template,
    rendered without any arguments.
    """
    page = requests.get(url)
    document = BeautifulSoup(page.content, 'html.parser')
    print(document.prettify())
    return render_template('data_gathering.html')


if __name__ == '__main__':
    app.run()

이렇게 뜸

 


- 영화 포스터 이미지 클릭해서 

class 선택하면 아래처럼 됨

 

 

-app.py

import requests
from bs4 import BeautifulSoup
from flask import Flask, render_template

app = Flask(__name__)

url = 'https://movie.naver.com/movie/running/current.nhn' #네이버영화 주소


@app.route('/')
def data_gathering():
    """Print only the ``<ul class="lst_detail_t1">`` element of the fetched page.

    The element is written to stdout; the browser receives the
    ``data_gathering.html`` template rendered without arguments.
    """
    page = requests.get(url)
    document = BeautifulSoup(page.content, 'html.parser')

    # find() yields the first matching tag only.
    listing = document.find('ul', class_="lst_detail_t1")
    print(listing)

    return render_template('data_gathering.html')


if __name__ == '__main__':
    app.run()

 

-> lst_detail_t1 부분만 나옴 ( 클릭된 곳만)


- 이미지 가져오기 

 

li > div > a > img 

 

import requests
from bs4 import BeautifulSoup
from flask import Flask, render_template

app = Flask(__name__)

url = 'https://movie.naver.com/movie/running/current.nhn' #네이버영화 주소


@app.route('/')
def data_gathering():
    """Print how many poster ``<img>`` tags the listing has, then each ``src``.

    Output goes to stdout; the browser receives the ``data_gathering.html``
    template rendered without arguments.
    """
    document = BeautifulSoup(requests.get(url).content, 'html.parser')
    listing = document.find('ul', class_="lst_detail_t1")

    # Direct-child chain li > div > a > img, evaluated relative to the listing.
    posters = listing.select('li > div > a > img')
    print(len(posters))
    for poster in posters:
        print(poster.get('src'))

    return render_template('data_gathering.html')


if __name__ == '__main__':
    app.run()

 

이렇게 뜸 


- 전체 관람가 가져오기 

 

    rating = ul.select('li > dl > dt > span')
    for rate in rating:
        print(rate.text)

-app.py 

import requests
from bs4 import BeautifulSoup
from flask import Flask, render_template

app = Flask(__name__)

url = 'https://movie.naver.com/movie/running/current.nhn' #네이버영화 주소


@app.route('/')
def data_gathering():
    """Print the text of every ``li > dl > dt > span`` label in the listing.

    These spans are the viewing-rating badges shown next to each title.
    Output goes to stdout; the browser receives the ``data_gathering.html``
    template rendered without arguments.
    """
    # Without a timeout requests may wait on an unresponsive server forever,
    # which would block this request handler.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')
    ul = soup.find('ul', class_="lst_detail_t1")

    rating = ul.select('li > dl > dt > span')
    for rate in rating:
        print(rate.text)

    return render_template('data_gathering.html')


if __name__ == '__main__':
    app.run()

 

이렇게 뜸


- 영화제목 가져오기 

 

 

    movie_names = ul.select('li > dl > dt > a')
    for movie_name in movie_names:
        print(movie_name.text)
import requests
from bs4 import BeautifulSoup
from flask import Flask, render_template

app = Flask(__name__)

url = 'https://movie.naver.com/movie/running/current.nhn' #네이버영화 주소


@app.route('/')
def data_gathering():
    """Print the text of every ``li > dl > dt > a`` link (the movie titles).

    Output goes to stdout; the browser receives the ``data_gathering.html``
    template rendered without arguments.
    """
    # Without a timeout requests may wait on an unresponsive server forever,
    # which would block this request handler.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')
    ul = soup.find('ul', class_="lst_detail_t1")

    movie_names = ul.select('li > dl > dt > a')
    for movie_name in movie_names:
        print(movie_name.text)

    return render_template('data_gathering.html')


if __name__ == '__main__':
    app.run()

이렇게 뜸 


- 네티즌 글자 가져오기

우클릭해서 copy selector 해보면 

#content > div.article > div:nth-child(1) > div.lst_wrap > ul > li:nth-child(1) > dl > dd.star > dl.info_star > dt

- 이것만 가져오기 

dl > dd.star > dl.info_star > dt

 

    estimate_poeple = ul.select('dl > dd.star > dl.info_star > dt')
    for estimated_p in estimate_poeple:
        print(estimated_p.text)

- app.py

import requests
from bs4 import BeautifulSoup
from flask import Flask, render_template

app = Flask(__name__)

url = 'https://movie.naver.com/movie/running/current.nhn' #네이버영화 주소


@app.route('/')
def data_gathering():
    """Print the text of every ``dl.info_star > dt`` label in the listing.

    The selector is the tail of the browser's "copy selector" path; the part
    above the ``<ul>`` is dropped because the search already starts from it.
    Output goes to stdout; the browser receives the ``data_gathering.html``
    template rendered without arguments.
    """
    # Without a timeout requests may wait on an unresponsive server forever,
    # which would block this request handler.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')
    ul = soup.find('ul', class_="lst_detail_t1")

    estimated_people = ul.select('dl > dd.star > dl.info_star > dt')
    for estimated_p in estimated_people:
        print(estimated_p.text)

    return render_template('data_gathering.html')


if __name__ == '__main__':
    app.run()

- 평점 가져오기 

#content > div.article > div:nth-child(1) > div.lst_wrap > ul > li:nth-child(1) > dl > dd.star > dl.info_star > dd > div > a > span.num

- 위에 copy한 것 중에서 아래만 가져오기 (그 위에는 ul로 이미 가져와져있어서) 

dl > dd.star > dl.info_star > dd > div > a > span.num

- app.py 

import requests
from bs4 import BeautifulSoup
from flask import Flask, render_template

app = Flask(__name__)

url = 'https://movie.naver.com/movie/running/current.nhn' #네이버영화 주소


@app.route('/')
def data_gathering():
    """Print the text of every ``span.num`` score inside the listing's star blocks.

    Output goes to stdout; the browser receives the ``data_gathering.html``
    template rendered without arguments.
    """
    document = BeautifulSoup(requests.get(url).content, 'html.parser')
    listing = document.find('ul', class_="lst_detail_t1")

    score_css = 'dl > dd.star > dl.info_star > dd > div > a > span.num'
    for score_tag in listing.select(score_css):
        print(score_tag.text)

    return render_template('data_gathering.html')


if __name__ == '__main__':
    app.run()

 

 

 


- 참여 인원 가져오기 

#content > div.article > div:nth-child(1) > div.lst_wrap > ul > li:nth-child(1) > dl > dd.star > dl.info_star > dd > div > a > span.num2 > em

여기서 가져와 쓰기 

 dl > dd.star > dl.info_star > dd > div > a > span.num2 > em
import requests
from bs4 import BeautifulSoup
from flask import Flask, render_template

app = Flask(__name__)

url = 'https://movie.naver.com/movie/running/current.nhn' #네이버영화 주소


@app.route('/')
def data_gathering():
    """Print the text of every ``span.num2 > em`` element (participant counts).

    Output goes to stdout; the browser receives the ``data_gathering.html``
    template rendered without arguments.
    """
    # Without a timeout requests may wait on an unresponsive server forever,
    # which would block this request handler.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')
    ul = soup.find('ul', class_="lst_detail_t1")

    number_of_participants = ul.select(
        'dl > dd.star > dl.info_star > dd > div > a > span.num2 > em')
    for number_of_participant in number_of_participants:
        print(number_of_participant.text)

    return render_template('data_gathering.html')


if __name__ == '__main__':
    app.run()

이렇게 뜸 

 


지금까지 했던 것들 list로 만들기 

 

app.py 

import requests
from bs4 import BeautifulSoup
from flask import Flask, render_template

app = Flask(__name__)

url = 'https://movie.naver.com/movie/running/current.nhn'


@app.route('/')
def data_gathering():
    """Collect each scraped field of the movie listing into its own list.

    Six parallel lists are built from the ``<ul class="lst_detail_t1">``
    element: poster ``src`` values, rating labels, titles, the ``info_star``
    labels, scores and participant counts. Only the participant counts are
    printed; the other lists are gathered but not yet used. The browser
    receives the ``data_gathering.html`` template rendered without arguments.
    """
    # Without a timeout requests may wait on an unresponsive server forever,
    # which would block this request handler.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')
    ul = soup.find('ul', class_="lst_detail_t1")

    img_list = [img.get('src') for img in ul.select('li > div > a > img')]
    rating_list = [rate.text for rate in ul.select('li > dl > dt > span')]
    movie_name_list = [name.text for name in ul.select('li > dl > dt > a')]
    estimated_people_list = [
        label.text for label in ul.select('dl > dd.star > dl.info_star > dt')
    ]
    estimate_scores_list = [
        score.text
        for score in ul.select('dl > dd.star > dl.info_star > dd > div > a > span.num')
    ]
    number_of_participants_list = [
        count.text
        for count in ul.select('dl > dd.star > dl.info_star > dd > div > a > span.num2 > em')
    ]

    print(number_of_participants_list)
    return render_template('data_gathering.html')


if __name__ == '__main__':
    app.run()


- 감독, 배우는 여러 명이라서 리스트로 만들었음 

- 연령 등급은 없는 영화도 있어서 리스트로 만들었음. 등급이 없으면 빈 리스트가 됨

 

 

- app.py

import requests
from bs4 import BeautifulSoup
from flask import Flask, render_template

app = Flask(__name__)

url = 'https://movie.naver.com/movie/running/current.nhn'


@app.route('/')
def hello_world():
    """Build one list-shaped record per ``<li>`` of the listing and render them.

    Record layout (the template indexes into it positionally):
    ``[img_src, age, movie_name, netizen, netizen_num, actor_list, directer]``.
    ``age``, ``actor_list`` and ``directer`` are lists of texts and are simply
    empty when their selector matches nothing; the ``[0]`` lookups require at
    least one match. Every record is printed, then the records are passed to
    ``data_gathering.html`` under the name ``Movie``.
    """
    page = requests.get(url)
    document = BeautifulSoup(page.content, 'html.parser')
    listing = document.find('ul', class_='lst_detail_t1')

    movie_list = []
    for piece in listing.find_all('li'):
        poster = piece.find('img')
        actor_list = [
            link.text
            for link in piece.select('dl > dd:nth-child(3) > dl > dd:nth-child(6) > span > a')
        ]
        img_src = poster.get('src')
        directer = [
            link.text
            for link in piece.select('dl > dd:nth-child(3) > dl > dd:nth-child(4) > span > a')
        ]
        age = [badge.text for badge in piece.select('dl > dt > span')]
        movie_name = piece.select(' dl > dt > a')[0].text
        netizen = piece.select('dl > dd.star > dl > dd > div > a > span.num')[0].text
        netizen_num = piece.select(
            'dl > dd.star > dl.info_star > dd > div > a > span.num2 > em')[0].text
        record = [img_src, age, movie_name, netizen, netizen_num, actor_list, directer]
        movie_list.append(record)

    for record in movie_list:
        print(record)
    return render_template('data_gathering.html', Movie=movie_list)


if __name__ == '__main__':
    app.run()

 

- data_gathering.html 

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
    <style>
        .flex-container {
            display: flex;
            flex-direction: row;
            margin: 20px;

            width: 400px;
            margin: 0 auto;

            box-shadow: 2px 2px 4px gray;
        }
    </style>
</head>
<body>
    {% for movie in Movie %}
      <div class="flex-container">
        <img style="width: 100px; height: 150px;" src="{{ movie[0] }}">
     <ul>
         <li>관람가 등급 : {% for i in movie[1] %} {{ i }} {% endfor %}</li>
         <li>제목 : {{ movie[2] }}</li>
         <li>평 점 : {{ movie[3] }}</li>
        <li> 감독 :
            {% for director in movie[6] %}
                {{ director }},
            {% endfor %}
         </li>
         <li> 배우 :
            {% for actor in movie[5] %}
                {{ actor }},
            {% endfor %}
         </li>
     </ul>
     </div>


    {%  endfor %}
</body>
</html>


-리스트 안에 리스트 말고 class로 만들기 

 

- app.py

import requests
from bs4 import BeautifulSoup
from flask import Flask, render_template

app = Flask(__name__)

url = 'https://movie.naver.com/movie/running/current.nhn'


@app.route('/')
def hello_world():
    """Build one list-shaped record per ``<li>`` of the listing and render them.

    Record layout (the template indexes into it positionally):
    ``[img_src, age, movie_name, netizen, netizen_num, actor_list, directer]``.
    ``age``, ``actor_list`` and ``directer`` are lists of texts and are simply
    empty when their selector matches nothing; the ``[0]`` lookups require at
    least one match. Every record is printed, then the records are passed to
    ``data_gathering.html`` under the name ``Movie``.
    """
    page = requests.get(url)
    document = BeautifulSoup(page.content, 'html.parser')
    listing = document.find('ul', class_='lst_detail_t1')

    movie_list = []
    for piece in listing.find_all('li'):
        poster = piece.find('img')
        actor_list = [
            link.text
            for link in piece.select('dl > dd:nth-child(3) > dl > dd:nth-child(6) > span > a')
        ]
        img_src = poster.get('src')
        directer = [
            link.text
            for link in piece.select('dl > dd:nth-child(3) > dl > dd:nth-child(4) > span > a')
        ]
        age = [badge.text for badge in piece.select('dl > dt > span')]
        movie_name = piece.select(' dl > dt > a')[0].text
        netizen = piece.select('dl > dd.star > dl > dd > div > a > span.num')[0].text
        netizen_num = piece.select(
            'dl > dd.star > dl.info_star > dd > div > a > span.num2 > em')[0].text
        record = [img_src, age, movie_name, netizen, netizen_num, actor_list, directer]
        movie_list.append(record)

    for record in movie_list:
        print(record)
    return render_template('data_gathering.html', Movie=movie_list)


class Book:
    """Plain record holding a book's title and author."""

    def __init__(self, title, author):
        # Both values are stored as given, with no validation or conversion.
        self.title, self.author = title, author


@app.route('/study')
def study():
    """Render ``study.html`` with two hard-coded ``Book`` objects.

    Each book's title and author are also printed to stdout. The template
    receives the objects as ``book_list``.
    """
    book_list = [Book('제목1', '저자1'), Book('제목2', '저자2')]
    for entry in book_list:
        print(entry.title, entry.author)
    return render_template('study.html', book_list=book_list)


if __name__ == '__main__':
    app.run()

 

-study.html

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    {% for book in book_list %}
        {{ book.title }}
        <br>
        {{ book.author }}
    {% endfor %}

</body>
</html>

이런 식으로 간단해짐. 

 

- app.py

import requests
from bs4 import BeautifulSoup
from flask import Flask, render_template

app = Flask(__name__)

url = 'https://movie.naver.com/movie/running/current.nhn'


class NaverMovie:
    """One movie entry scraped from the listing page.

    Every constructor argument is stored unchanged under an attribute of the
    same name, so templates can write ``movie.movie_name`` instead of
    indexing into a positional list.
    """

    def __init__(self, img_src, age, movie_name, netizen, netizen_num,
                 actor_list, directer):
        self.img_src = img_src
        self.age = age
        self.movie_name = movie_name
        self.netizen = netizen
        self.netizen_num = netizen_num
        self.actor_list = actor_list
        self.directer = directer

    def __repr__(self):
        # Show every field so that print(movie) is readable rather than the
        # default "<NaverMovie object at 0x...>".
        fields = ('img_src', 'age', 'movie_name', 'netizen', 'netizen_num',
                  'actor_list', 'directer')
        shown = ', '.join(
            '{}={!r}'.format(name, getattr(self, name)) for name in fields)
        return '{}({})'.format(type(self).__name__, shown)


@app.route('/')
def hello_world():
    """Scrape the listing into ``NaverMovie`` objects and render them.

    One object is built per ``<li>`` found under ``<ul class="lst_detail_t1">``.
    ``age`` and ``actor_list`` are lists of texts (empty when nothing matches).
    ``directer`` is a single string: all director names joined with ``', '``,
    or ``''`` when none is listed. Each object is printed, then the objects
    are passed to ``data_gathering.html`` as ``movie_list``.

    Raises:
        IndexError: if an entry has no title, score or participant count.
    """
    # Without a timeout requests may wait on an unresponsive server forever,
    # which would block this request handler.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')
    ul = soup.find('ul', class_='lst_detail_t1')

    movie_list = []
    for piece in ul.find_all('li'):
        img_src = piece.find('img').get('src')
        actor_list = [
            link.text
            for link in piece.select('dl > dd:nth-child(3) > dl > dd:nth-child(6) > span > a')
        ]
        # Taking only [0] dropped co-directors and raised IndexError when no
        # director was listed; joining keeps the value a plain string for the
        # template while covering both cases.
        directer = ', '.join(
            link.text
            for link in piece.select('dl > dd:nth-child(3) > dl > dd:nth-child(4) > span > a')
        )
        age = [badge.text for badge in piece.select('dl > dt > span')]
        movie_name = piece.select(' dl > dt > a')[0].text
        netizen = piece.select('dl > dd.star > dl > dd > div > a > span.num')[0].text
        netizen_num = piece.select(
            'dl > dd.star > dl.info_star > dd > div > a > span.num2 > em')[0].text
        movie_list.append(
            NaverMovie(img_src, age, movie_name, netizen, netizen_num,
                       actor_list, directer))

    for movie in movie_list:
        print(movie)
    return render_template('data_gathering.html', movie_list=movie_list)


class Book:
    """Plain record holding a book's title and author."""

    def __init__(self, title, author):
        # Both values are stored as given, with no validation or conversion.
        self.title, self.author = title, author


@app.route('/study')
def study():
    """Render ``study.html`` with two hard-coded ``Book`` objects.

    Each book's title and author are also printed to stdout. The template
    receives the objects as ``book_list``.
    """
    book_list = [Book('제목1', '저자1'), Book('제목2', '저자2')]
    for entry in book_list:
        print(entry.title, entry.author)
    return render_template('study.html', book_list=book_list)


if __name__ == '__main__':
    app.run()

 

- data_gathering.html

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
    <style>
        .flex-container {
            display: flex;
            flex-direction: row;
            margin: 20px;

            width: 400px;
            margin: 0 auto;

            box-shadow: 2px 2px 4px gray;
        }
    </style>
</head>
<body>
    {% for movie in movie_list %}
      <div class="flex-container">
        <img style="width: 100px; height: 150px;" src="{{ movie.img_src }}">
     <ul>
         <li>관람가 등급 : {% for i in movie.age %} {{ i }} {% endfor %}</li>
         <li>제목 : {{ movie.movie_name }}</li>
         <li>평 점 : {{ movie.netizen }}</li>
         <li>감독 : {{ movie.directer }}</li>
         <li> 배우 :
            {% for actor in movie.actor_list %}
                {{ actor }},
            {% endfor %}
         </li>
     </ul>
     </div>


    {%  endfor %}
</body>
</html>

* a = [1]

print(a) -> [1] 처럼 리스트 자체가 출력됨. 리스트에는 .text 속성이 없어서 a.text는 안됨

그래서 print(a[0]) -> 1 이렇게 값만 뽑은거 

'mongodb' 카테고리의 다른 글

mysql  (0) 2021.01.27
로또 웹  (0) 2021.01.27
mysql - python 연동  (0) 2021.01.26
mysql  (0) 2021.01.25
flex box  (0) 2021.01.25

+ Recent posts