Mass loading but overrunning API limits

This commit is contained in:
Keith Edmunds 2023-01-10 17:09:06 +00:00
parent e6d8f10fe3
commit b797746229
7 changed files with 262 additions and 115 deletions

1
.gitignore vendored
View File

@ -5,3 +5,4 @@ tags
Session.vim
.direnv
.envrc
testdata/

View File

@ -19,4 +19,6 @@ class Config(object):
MAIL_SERVER = os.environ.get('MAIL_SERVER') or "woodlands.midnighthax.com"
MAIL_USERNAME = os.environ.get('MAIL_USERNAME')
MAIL_USE_TLS = os.environ.get('MAIL_USE_TLS') is not None
MAX_CONTENT_LENGTH = 4096
MAX_POSTS_TO_FETCH = 2000
NORMAL_COLOUR = "#f6f5f4"

View File

@ -21,13 +21,13 @@ def ask_yes_no(title: str, question: str) -> bool:
return button_reply == QMessageBox.Yes
def format_username(account) -> str:
def format_display_name(account) -> str:
"""
Format account username according to whether we follow that account
Format account display name according to whether we follow that account
or not.
"""
username = account.username
username = account.display_name
if account.followed:
colour = Config.FOLLOWED_COLOUR
else:

View File

@ -11,6 +11,7 @@ from sqlalchemy import (
Column,
DateTime,
ForeignKey,
func,
Integer,
select,
String,
@ -208,7 +209,8 @@ class Posts(Base):
created_at = Column(DateTime, index=True, default=None)
uri = Column(String(256), index=False)
url = Column(String(256), index=False)
content = Column(String(2048), index=False, default="")
content = Column(String(Config.MAX_CONTENT_LENGTH), index=False,
default="")
account_id = Column(Integer, ForeignKey('accounts.id'), nullable=True)
account = relationship("Accounts", back_populates="posts")
@ -232,11 +234,31 @@ class Posts(Base):
session.add(self)
session.commit()
@classmethod
def get_unrated_after(cls, session: Session,
                      post_id: int) -> Optional["Posts"]:
    """
    Find the first unrated post whose post_id is greater than the
    passed post_id.

    Candidates are ordered by ascending post_id, so the result is the
    closest unrated post above post_id. Returns None when no such
    post exists.
    """

    # Unrated means the rating column is NULL
    stmt = (
        select(cls)
        .where(cls.rating.is_(None), cls.post_id > post_id)
        .order_by(cls.post_id.asc())
        .limit(1)
    )
    return session.scalars(stmt).first()
@classmethod
def get_unrated_before(cls, session: Session,
post_id: int) -> Optional["Posts"]:
"""
Return latest unrated Posts object before past post_id, or None
Return latest unrated Posts object before passed post_id, or None
if there isn't one.
"""
@ -259,7 +281,6 @@ class Posts(Base):
is not a boosted post, or None if there isn't one.
"""
print("get_unrated_newest")
return (
session.scalars(
select(cls)
@ -269,6 +290,36 @@ class Posts(Base):
).first()
)
@classmethod
def get_unrated_oldest(cls, session: Session) -> Optional["Posts"]:
    """
    Return the unrated Posts object with the lowest post_id, or None
    if every post has been rated.

    NOTE(review): this query filters on rating only; if boosted posts
    are meant to be excluded, that is left to the caller - confirm.
    """

    # Unrated means the rating column is NULL
    stmt = (
        select(cls)
        .where(cls.rating.is_(None))
        .order_by(cls.post_id.asc())
        .limit(1)
    )
    return session.scalars(stmt).first()
@classmethod
def get_by_post_id(cls, session: Session,
                   post_id: str) -> Optional["Posts"]:
    """
    Return the post identified by post_id, or None if there is no
    such post.

    Callers must handle the None case; the return annotation is
    Optional to make that explicit.
    """

    return (
        session.scalars(
            select(cls)
            .where(cls.post_id == post_id)
            .limit(1)
        ).first()
    )
@classmethod
def get_or_create(cls, session: Session, post_id: str) -> "Posts":
"""
@ -287,6 +338,14 @@ class Posts(Base):
return rec
@staticmethod
def max_post_id(session):
    """
    Return the largest post_id held in the posts table.

    The value is whatever the database's MAX() yields for the column.
    NOTE(review): if post_id is stored as a string, MAX() compares
    lexically rather than numerically - confirm this is acceptable.
    """

    highest = select(func.max(Posts.post_id))
    return session.scalars(highest).first()
class PostTags(Base):
__tablename__ = 'post_tags'

View File

@ -65,7 +65,7 @@ p, li { white-space: pre-wrap; }
<property name="minimumSize">
<size>
<width>0</width>
<height>181</height>
<height>0</height>
</size>
</property>
<property name="frameShape">
@ -75,7 +75,7 @@ p, li { white-space: pre-wrap; }
<string/>
</property>
<property name="scaledContents">
<bool>true</bool>
<bool>false</bool>
</property>
</widget>
<widget class="QTextEdit" name="txtHashtags">

View File

@ -29,10 +29,10 @@ class Ui_MainWindow(object):
self.txtPost.setObjectName("txtPost")
self.lblPicture = QtWidgets.QLabel(self.centralwidget)
self.lblPicture.setGeometry(QtCore.QRect(10, 770, 351, 201))
self.lblPicture.setMinimumSize(QtCore.QSize(0, 181))
self.lblPicture.setMinimumSize(QtCore.QSize(0, 0))
self.lblPicture.setFrameShape(QtWidgets.QFrame.StyledPanel)
self.lblPicture.setText("")
self.lblPicture.setScaledContents(True)
self.lblPicture.setScaledContents(False)
self.lblPicture.setObjectName("lblPicture")
self.txtHashtags = QtWidgets.QTextEdit(self.centralwidget)
self.txtHashtags.setGeometry(QtCore.QRect(370, 90, 331, 871))

View File

@ -1,5 +1,6 @@
#! /usr/bin/env python
import datetime
import ipdb
import os
import pickle
@ -11,7 +12,7 @@ import sys
from config import Config
from dbconfig import engine, Session, scoped_session
from helpers import (
format_username,
format_display_name,
index_ojects_by_parameter,
send_mail,
)
@ -29,6 +30,7 @@ from models import (
from typing import List, Optional
from PyQt5.QtCore import Qt
from PyQt5.QtGui import (
QImage,
QPixmap,
@ -85,50 +87,16 @@ class MastodonAPI:
return self.mastodon.fetch_remaining(page1)
class UnratedPosts:
    """
    Return unrated posts one at a time, stepping in either direction.

    self.dataset holds the posts returned by Posts.get_unrated_posts();
    self.pointer is the index of the post most recently returned, or
    None when iteration has not started or has run off either end.
    """

    def __init__(self, session: "Session") -> None:
        self.dataset = Posts.get_unrated_posts(session)
        self.pointer = None

    def _step(self, start: int, delta: int) -> Optional["Posts"]:
        """
        Move the pointer and return the post it lands on.

        start is the index to use when there is no current position;
        delta is added to the current position otherwise. Landing
        outside the dataset resets the pointer and returns None.
        """

        if self.pointer is None:
            position = start
        else:
            position = self.pointer + delta

        if 0 <= position < len(self.dataset):
            self.pointer = position
            return self.dataset[position]

        # Ran off one end of the dataset (or it is empty)
        self.pointer = None
        return None

    def next(self) -> Optional["Posts"]:
        """
        Return the following post, starting from the first record if
        there is no current position, or None past the last record.
        """

        return self._step(0, 1)

    def prev(self) -> Optional["Posts"]:
        """
        Return the preceding post, starting from the last record if
        there is no current position, or None before the first record.
        """

        return self._step(len(self.dataset) - 1, -1)
class Window(QMainWindow, Ui_MainWindow):
def __init__(self, parent=None) -> None:
super().__init__(parent)
self.setupUi(self)
# self.mastapi = MastodonAPI(Config.ACCESS_TOKEN)
self.mastapi = MastodonAPI(Config.ACCESS_TOKEN)
self.update_db()
self.current_post_id = None
self.next_post = self.next
self.btnDislike.clicked.connect(self.dislike)
self.btnFirst.clicked.connect(self.first)
@ -165,13 +133,13 @@ class Window(QMainWindow, Ui_MainWindow):
# Boosted
if boosted_by:
self.txtBoosted.setText(
"Boosted by: " + format_username(boosted_by))
"Boosted by: " + format_display_name(boosted_by))
self.txtBoosted.show()
else:
self.txtBoosted.hide()
# Username
self.txtUsername.setText(format_username(post.account))
self.txtUsername.setText(format_display_name(post.account))
# Debug
self.lblDebug.setText(str(post.id))
@ -199,21 +167,23 @@ class Window(QMainWindow, Ui_MainWindow):
# Image
if post.media_attachments:
image = QImage()
# TODO: handle multiple images, not just [0]
url_image = post.media_attachments[0].preview_url
image.loadFromData(requests.get(url_image).content)
self.lblPicture.setPixmap(QPixmap(image))
pixmap = QPixmap()
pixmap.loadFromData(requests.get(url_image).content)
s_pixmap = pixmap.scaled(self.lblPicture.size(),
Qt.KeepAspectRatio)
self.lblPicture.show()
self.lblPicture.setPixmap(s_pixmap)
else:
self.lblPicture.hide()
def dislike(self):
"""
actions
Mark a post as rated negatively
"""
pass
self.rate_post(rating=-1)
def first(self):
"""
@ -231,10 +201,10 @@ class Window(QMainWindow, Ui_MainWindow):
def like(self):
"""
actions
Mark a post as rated positively
"""
pass
self.rate_post(rating=1)
def next(self) -> None:
"""
@ -245,17 +215,20 @@ class Window(QMainWindow, Ui_MainWindow):
display newest unrated post.
"""
# Get post to display, but don't process posts that are boosted
# as they will be processed by the boosting post
# Remember whether we're going forward or backwards through
# posts
self.next_post = self.next
# Get post to display
with Session() as session:
if self.current_post_id is None:
post = Posts.get_unrated_newest(session)
while post and post.reblogged_by_post:
post = Posts.get_unrated_newest(session)
else:
post = Posts.get_unrated_before(session, self.current_post_id)
while post and post.reblogged_by_post:
post = Posts.get_unrated_before(session, post.post_id)
# Don't process posts that are boosted as they will be
# processed by the boosting post
while post and post.reblogged_by_post:
post = Posts.get_unrated_before(session, post.post_id)
if not post:
self.current_post_id = None
show_OK("All done", "No more posts to process")
@ -266,17 +239,169 @@ class Window(QMainWindow, Ui_MainWindow):
def prev(self):
"""
actions
Display previous post. We work BACKWARDS through posts so
"previous" is actually one newer.
If we are called with self.current_post_id set to None, retrieve and
display oldest unrated post.
"""
pass
# Remember whether we're going forward or backwards through
# posts
self.next_post = self.prev
# Get post to display, but don't process posts that are boosted
# as they will be processed by the boosting post
with Session() as session:
if self.current_post_id is None:
post = Posts.get_unrated_oldest(session)
else:
post = Posts.get_unrated_after(session, self.current_post_id)
# Don't process posts that are boosted as they will be
# processed by the boosting post
while post and post.reblogged_by_post:
post = Posts.get_unrated_after(session, post.post_id)
if not post:
self.current_post_id = None
show_OK("All done", "No more posts to process")
return
self.current_post_id = post.post_id
self.display(session, post)
def rate_post(self, rating: int) -> None:
    """
    Add rating to the current post, then move on to the next post in
    the direction we were last travelling (self.next_post).

    Does nothing if no post is currently selected or the current post
    can no longer be found in the database.
    """

    # No post is selected (nothing displayed yet, or we ran out of
    # posts), so there is nothing to rate
    if self.current_post_id is None:
        return

    with Session() as session:
        post = Posts.get_by_post_id(session, self.current_post_id)
        if post is None:
            log.debug(f"{self.current_post_id=} not found, cannot rate")
            return

        post.rating = rating
        # Commit explicitly: a plain SQLAlchemy session does not commit
        # when its context manager exits. Harmless if Session already
        # commits on exit - TODO confirm against dbconfig.
        session.commit()

        self.next_post()
def unsure(self):
"""
actions
Mark a post as rated neutrally
"""
pass
self.rate_post(rating=0)
def update_db(self) -> None:
    """
    Update database from Mastodon

    Pages through the home timeline, passing each post to
    _process_post(), until one of the following is true:
      - a post at or below the highest post_id already stored is seen
      - Config.MAX_POSTS_TO_FETCH posts have been fetched
      - the server returns no more posts

    Save a copy of downloaded data for debugging (one pickle file per
    page, written under testdata/).
    """

    def id_key(post_id) -> tuple:
        # Mastodon ids are numeric strings: order them by length first,
        # then lexically, so that e.g. "9" sorts before "10"
        text = str(post_id)
        return (len(text), text)

    with Session() as session:
        minimum_post_id = Posts.max_post_id(session)
        if not minimum_post_id:
            minimum_post_id = "1"
        minimum_key = id_key(minimum_post_id)

        posts_to_get = Config.MAX_POSTS_TO_FETCH
        reached_minimum = False
        hometl = []

        # The debug dumps below need somewhere to go
        os.makedirs("testdata", exist_ok=True)

        while True:
            # Create a filename to save data
            now = datetime.datetime.now()
            seq = 0
            while True:
                fname = (
                    "testdata/" +
                    now.strftime("%Y-%m-%d_%H:%M:%S_") +
                    f"{seq:02d}.pickle"
                )
                if not os.path.isfile(fname):
                    print(f"{fname=}")
                    break
                seq += 1
                print(f"{seq=}")

            # Fetch data
            if not hometl:
                print("Fetching first data...")
                hometl = self.mastapi.mastodon.timeline()
            else:
                print("Fetching next data...")
                hometl = self.mastapi.mastodon.fetch_next(hometl)
            # fetch_next() returns None when there is no further page;
            # normalise to an empty list so the code below is safe
            hometl = hometl or []
            print(f"Fetched additional {len(hometl)} posts")
            with open(fname, "wb") as f:
                pickle.dump(hometl, f)

            # NOTE(review): stopping at the first old post assumes each
            # page is ordered newest first - confirm
            for post in hometl:
                if id_key(post.id) <= minimum_key:
                    reached_minimum = True
                    break
                print(f"Processing {post.id=}")
                self._process_post(session, post)

            posts_to_get -= len(hometl)
            print(f"{posts_to_get=}")
            if posts_to_get <= 0 or reached_minimum or not hometl:
                break

        # Make sure fields set on the last processed post are persisted.
        # Harmless if Session already commits on exit - TODO confirm
        # against dbconfig.
        session.commit()
def _process_post(self, session: Session, post) -> Posts:
    """
    Store the passed post in the database and return its Posts record.

    Account, hashtag and attachment records are created as needed. A
    post that reblogs another is handled by recursing into the
    reblogged post and recording its record id as boosted_post_id.
    A post whose record already has an account_id is treated as
    already stored and returned unchanged.
    """

    log.debug(f"{post.id=} processing")

    rec = Posts.get_or_create(session, str(post.id))
    already_stored = rec.account_id is not None
    if already_stored:
        log.debug(f"{post.id=} already in db")
        return rec

    # Author: fetch the account record, filling it in if it is new
    log.debug(f"{post.id=} processing {post.account.id=}")
    author_rec = Accounts.get_or_create(session, str(post.account.id))
    if author_rec.username is None:
        log.debug(f"{post.id=} populating new account {post.account.id=}")
        author_rec.username = post.account.username
        author_rec.acct = post.account.acct
        author_rec.display_name = post.account.display_name
        author_rec.bot = post.account.bot
        author_rec.url = post.account.url
    rec.account_id = author_rec.id

    # Hashtags: link each tag on the post to this record
    for tag in post.tags:
        log.debug(f"{post.id=} processing {tag.name=}")
        rec.hashtags.append(
            Hashtags.get_or_create(session, tag.name, tag.url))

    # Media: only populate attachment records we haven't seen before
    if not post.media_attachments:
        log.debug(f"{post.id=} No media attachments")
    else:
        for media in post.media_attachments:
            log.debug(f"{post.id=} processing {media.id=}")
            attachment = Attachments.get_or_create(
                session, str(media.id), rec.id)
            if attachment.type:
                log.debug(f"{post.id=} {media.id=} already exists")
            else:
                log.debug(f"{post.id=} {media.id=} new record")
                attachment.type = media.type
                attachment.url = media.url
                attachment.preview_url = media.preview_url
                attachment.description = media.description

    # Post fields; content is cut to fit the column width
    rec.account_id = author_rec.id
    rec.created_at = post.created_at
    rec.uri = post.uri
    rec.url = post.url
    rec.content = post.content[:Config.MAX_CONTENT_LENGTH]
    log.debug(f"{post.id=} {post.content=}")

    # Boosts: store the boosted post too and point this record at it
    if post.reblog:
        log.debug(f"{post.id=} {post.reblog.id=}")
        boosted_rec = self._process_post(session, post.reblog)
        rec.boosted_post_id = boosted_rec.id
        log.debug(f"{post.id=} {rec.boosted_post_id=}")

    return rec
def update_followed_accounts(self, session: Session) -> None:
"""
@ -346,54 +471,6 @@ class Window(QMainWindow, Ui_MainWindow):
# class HoldingPot:
# def process_post(post):
# rec = Posts.get_or_create(session, str(post.id))
# if rec.account_id is not None:
# # We already have this post
# return
#
# # Create account record if needed
# account_rec = Accounts.get_or_create(session, str(post.account.id))
# if account_rec.username is None:
# account_rec.username = post.account.username
# account_rec.acct = post.account.acct
# account_rec.display_name = post.account.display_name
# account_rec.bot = post.account.bot
# account_rec.url = post.account.url
# rec.account_id = account_rec.id
#
# # Create hashtag records as needed
# for tag in post.tags:
# hashtag = Hashtags.get_or_create(session, tag.name, tag.url)
# rec.hashtags.append(hashtag)
#
# # Handle media
# for media in post.media_attachments:
# media_rec = Attachments.get_or_create(session,
# str(media.id), rec.id)
# if not media_rec.type:
# media_rec.type = media.type
# media_rec.url = media.url
# media_rec.preview_url = media.preview_url
# media_rec.description = media.description
#
# rec.account_id = account_rec.id
# rec.created_at = post.created_at
# rec.uri = post.uri
# rec.url = post.url
# rec.content = post.content
#
# if post.reblogged_by_post:
# rec.boosted_post_id = process_post(post.reblogged_by_post).id
#
# return rec
#
# # Data for development
# with open(TESTDATA, "rb") as inp:
# hometl = pickle.load(inp)
#
# with Session() as session:
# for post in hometl:
# process_post(post)
if __name__ == "__main__":
@ -418,3 +495,11 @@ if __name__ == "__main__":
print("\033[1;31;47mUnhandled exception starts")
stackprinter.show(style="darkbg")
print("Unhandled exception ends\033[1;37;40m")
# # Data for development
# with open(TESTDATA, "rb") as inp:
# hometl = pickle.load(inp)
#
# with Session() as session:
# for post in hometl:
# process_post(post)