# coding=utf-8
"""
"""
import hashlib
import datetime
import requests
from bs4 import BeautifulSoup
from .conf import settings
from django.core.files.uploadedfile import SimpleUploadedFile
from .models import HatenablogEntry
# Format of the RSS2 pubDate field
PUBDATE_FORMAT = '%a, %d %b %Y %H:%M:%S %z'
class HatenablogFeedScraper(object):
    """Scrape a Hatenablog RSS2 feed and mirror its entries in the database.

    Each ``<item>`` of the feed becomes a :class:`HatenablogEntry`, keyed by
    the entry URL.  The MD5 of the entry body is stored so consumers can
    detect when an entry's content has changed.
    """

    def __init__(self, url=None, verbose=False):
        """
        Args:
            url: Feed URL.  Defaults to
                ``settings.ACTIVITIES_HATENABLOG_FEED_URL``.
            verbose: When True, print progress for each entry fetched.
        """
        self.url = url or settings.ACTIVITIES_HATENABLOG_FEED_URL
        self.verbose = verbose

    def fetch(self):
        """Fetch the feed and upsert every entry into the database.

        Returns:
            tuple: ``(ncreated, nupdated)`` — how many entries were newly
            created and how many existing ones were updated.

        Raises:
            requests.HTTPError: if the feed request fails.
        """
        r = requests.get(self.url)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'lxml-xml')
        entries = soup.find_all('item')
        n = len(entries)
        ncreated = 0
        for i, entry in enumerate(entries):
            url = entry.link.string
            title = entry.title.string
            # MD5 of the body text lets callers detect content changes.
            # hexdigest() already returns str, so no extra conversion needed.
            digest = hashlib.md5(
                entry.description.string.encode('utf-8')).hexdigest()
            if self.verbose:
                print("- Fetching entry '{}'... ({}/{})".format(
                    title, i+1, n,
                ))
            created_at = datetime.datetime.strptime(entry.pubDate.string,
                                                    PUBDATE_FORMAT)
            thumbnail = self._fetch_entry_thumbnail(entry)
            obj, created = HatenablogEntry.objects.update_or_create(
                url=url,
                defaults=dict(
                    title=title,
                    created_at=created_at,
                    thumbnail=thumbnail,
                    md5=digest,
                ),
            )
            if created:
                ncreated += 1
        return ncreated, n - ncreated

    def _fetch_entry_thumbnail(self, entry):
        """Download the entry page and return its og:image as an upload file.

        Args:
            entry: A feed ``<item>`` element; its ``<link>`` is fetched.

        Returns:
            SimpleUploadedFile with the image bytes, or None when the page
            declares no ``og:image`` or the image download fails.

        Raises:
            requests.HTTPError: if the entry page request fails.
        """
        r = requests.get(entry.link.string)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'lxml')
        thumbnail_url = None
        for meta in soup.find_all('meta'):
            if meta.get('property') == 'og:image':
                thumbnail_url = meta.get('content')
                # Use the first og:image — it is the canonical one per the
                # Open Graph protocol.
                break
        if not thumbnail_url:
            return None
        r = requests.get(thumbnail_url)
        if not r.ok:
            # Don't store an HTTP error body as image data.
            return None
        filename = thumbnail_url.split('/')[-1]
        return SimpleUploadedFile(filename, r.content)