playwright를 이용해 자바스크립트로 네이버 블로그 본문 내용 스크래핑하기

2025-03-11 |

playwright를 이용하여 블로그의 제목과 본문을 스크래핑하여 마크다운 파일로 저장하는 자바스크립트 소스입니다.

프로젝트 초기화

프로젝트를 설정하기 위해 bash 창에서 아래와 같은 과정을 거쳐 프로젝트 초기설정을 진행합니다.

mkdir blog_scraper # create the project folder
cd blog_scraper # move into the newly created folder
npm init -y # create package.json
touch index.js # create the index.js file
npm install playwright # install the playwright package
npx playwright install # install the browsers playwright needs to run

package.json

"type" 값을 commonjs에서 module로 변경해줍니다. (JSON은 주석을 지원하지 않으므로 파일 안에는 주석을 넣지 않습니다.)

{
  "name": "blog_scraper",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "type": "module",
  "dependencies": {
    "playwright": "^1.51.0"
  }
}

index.js

블로그의 제목과 본문을 가져와서 마크다운 파일로 저장하는 소스입니다.

import playwright from "playwright"
import path from "path"
import fs from "fs"

// Blog post URL: the first command-line argument after the script path
const [url] = process.argv.slice(2)

/**
 * Normalizes whitespace in scraped text.
 *
 * Every run of two or more whitespace characters (spaces, tabs, line breaks)
 * is collapsed into a single space; a lone whitespace character is kept as is.
 * The former follow-up `.replace(/\n/g, "\n")` swapped each newline for an
 * identical newline, so it was removed without changing the output.
 *
 * @param {string} text - Raw text scraped from the page.
 * @returns {string} The text with whitespace runs collapsed.
 */
const textFilter = (text) => text.replace(/\s{2,}/g, " ")

/**
 * Builds the Markdown document for a scraped post.
 *
 * @param {{title: string, body: string}} data - Scraped title and body text.
 * @returns {string} A level-1 heading holding the title, a blank line, then the body.
 */
const convertMarkdown = (data) => {
  // Trim after filtering so leftover edge whitespace cannot push the title
  // off the heading line or pad the body.
  const filteredTitle = textFilter(data.title).trim()
  const filteredContent = textFilter(data.body).trim()
  // An ATX heading needs a space between "#" and its text; "#Title" is a plain paragraph.
  return `# ${filteredTitle}\n\n${filteredContent}`
}

/**
 * Opens the blog post in Chromium and scrapes its title and body text.
 *
 * The post is read from inside the first iframe on the page, using the
 * `.se-viewer`, `.se-title-text` and `.se-main-container` selectors.
 *
 * @param {string} url - Address of the blog post to open.
 * @returns {Promise<{title: string, body: string}>} Text content of the title and body elements.
 * @throws {Error} If the page has no iframe or the iframe has no content frame;
 *   errors raised by Playwright (navigation, selectors) propagate unchanged.
 */
const getTextFromUrl = async (url) => {
  const browser = await playwright.chromium.launch({
    headless: false, // set to true to run without showing a browser window
  })

  // try/finally closes the browser even when a step below throws,
  // so a failed run does not leave a browser behind.
  try {
    const page = await browser.newPage()
    await page.goto(url)

    // The post content is inside an iframe: take the first one on the page
    const frameElementHandle = await page.$("iframe")
    if (!frameElementHandle) {
      throw new Error(`No iframe found on the page: ${url}`)
    }

    const frame = await frameElementHandle.contentFrame()
    if (!frame) {
      throw new Error(`The iframe has no content frame: ${url}`)
    }

    // Wait for the element that is actually read instead of sleeping a fixed second
    await frame.waitForSelector(".se-viewer", { state: "attached" })

    // Collect the title and body text from inside the iframe
    return await frame.$eval(".se-viewer", (viewer) => {
      const title = viewer.querySelector(".se-title-text").textContent
      const body = viewer.querySelector(".se-main-container").textContent
      return {
        title,
        body,
      }
    })
  } finally {
    await browser.close()
  }
}

/**
 * Scrapes the post at `url` and saves it as a Markdown file.
 *
 * The file name is the post title with everything except ASCII letters,
 * digits and Hangul removed, cut to 10 characters, plus ".md". The path is
 * relative, so the file lands in the current working directory.
 *
 * @param {string} url - Address of the blog post to scrape.
 * @returns {Promise<void>}
 * @throws {Error} If `url` is missing; scraping and file-system errors propagate unchanged.
 */
const main = async (url) => {
  // Fail fast with a usage hint instead of launching a browser with no address
  if (!url) {
    throw new Error("A blog URL is required. Usage: node index.js <blog-url>")
  }

  // Collect the title and body
  const content = await getTextFromUrl(url)

  // Build a file name of at most 10 characters from the title; fall back to
  // "untitled" when no allowed character is left, which would otherwise yield ".md"
  const fileName =
    content.title
      .trim()
      .replace(/[^a-zA-Z0-9가-힣ㄱ-ㅎㅏ-ㅣ]/g, "")
      .slice(0, 10) || "untitled"

  // Convert to Markdown
  const markdown = convertMarkdown(content)

  // Save the Markdown file
  fs.writeFileSync(`${fileName}.md`, markdown)

  console.log("파일저장 완료")
}

// url로부터 데이터 수집하여 저장
// Scrape the URL given on the command line and save the result.
// On failure, log the error and set a non-zero exit code so a shell or
// script running this file can tell that it failed.
try {
  await main(url)
} catch (e) {
  console.error(e)
  process.exitCode = 1
}

실행

실행은 블로그 URL을 복사해서 node index.js 뒤에 붙여주면 됩니다. 스크래핑 작업이 완료되면 블로그의 제목과 본문이 마크다운 파일로 저장됩니다.

node index.js https://blog.naver.com/아이디/글주소