Commit 78077365 authored by André Melo
Browse files

Finish implementing image extraction from preprint tar

parent 06af8e37
......@@ -4,6 +4,6 @@ scan arxiv:
only:
- schedules
before_script:
- conda install -y tweepy poppler -c conda-forge
- conda install -y tweepy poppler pdf2image -c conda-forge
script:
- python jet-detection.py
......@@ -69,23 +69,19 @@ def preprint_ids(only_fresh=True):
]
# + Collapsed="false"
todays_ids = preprint_ids()
if not todays_ids:
sys.exit()
# + [markdown] Collapsed="false"
# ## Download PDFs
# + Collapsed="false"
ALLOWED_EXTS = ['png', 'jpg', 'jpeg', 'tiff']
IGNORE_EXTS = ['tex', 'bst', 'cls', 'bbl', 'dat', 'rtx']
def pdf_url(paper_id) -> str:
    """Return the arXiv URL of the PDF for ``paper_id``.

    ``paper_id`` is interpolated verbatim after ``/pdf/``; no validation
    or escaping is performed.
    """
    return f'https://www.arxiv.org/pdf/{paper_id}'
def preprint_url(paper_id) -> str:
    """Return the arXiv e-print URL for ``paper_id``.

    ``paper_id`` is interpolated verbatim after ``/e-print/``; no
    validation or escaping is performed.
    """
    return f'https://www.arxiv.org/e-print/{paper_id}'
def pdf_path(paper_id):
    """Return the location of the PDF for ``paper_id`` under ``PDF_DIR``.

    The file name is ``<paper_id>.pdf``. ``PDF_DIR`` is defined elsewhere
    in the file; presumably a ``pathlib.Path`` given the ``/`` join —
    confirm against its definition.
    """
    return PDF_DIR / f'{paper_id}.pdf'
def response(paper_id):
"""Request response objects corresponding to each preprint of the day"""
# TODO: use different mirrors randomly for better performance.
......@@ -98,16 +94,13 @@ def response(paper_id):
else:
raise RuntimeError(f'Unknown error, f{r.status_code}')
ALLOWED_EXTS = ['png', 'jpg', 'jpeg', 'tiff']
IGNORE_EXTS = ['tex', 'bst', 'cls', 'bbl']
def convert_to_rgb(img):
if img.mode in ['L', 'CMYK', 'HSV', 'RGB']:
return img.convert('RGB')
elif img.mode == 'RGBA':
# Create a white background, and paste the RGBA image
# on top of it with the alpha channel as the mask
background = Image.new('RGB', img.size, (255, 255, 255))
background = PIL.Image.new('RGB', img.size, (255, 255, 255))
background.paste(img, mask=img.split()[-1])
return background
else:
......@@ -171,48 +164,10 @@ def extract_images(response):
elif content_type == 'application/x-eprint-tar':
with tarfile.open(fileobj=BytesIO(response.content)) as tar:
yield from extract_images_tar(tar)
else:
elif content_type != 'application/x-eprint':
logging.warning(f'Unknown content type {content_type}')
# + Collapsed="false"
today = date.today().isoformat().replace('-', '')
responses = []
for paper_id in tqdm.tqdm(todays_ids):
responses.append(response(paper_id))
# + Collapsed="false"
for response in responses:
content_type = response.headers['Content-Type']
if content_type == 'application/pdf':
continue
images = [img for img in extract_images(response)]
break
# + Collapsed="false"
len(images)
# + Collapsed="false"
images[0]
# + [markdown] Collapsed="false"
# # Extract images
# + Collapsed="false"
IMG_DIR = Path(f'{today}_imgs')
IMG_DIR.mkdir(exist_ok=True)
# + Collapsed="false"
# TODO: could run in parallel with downloading
for paper_id in tqdm.tqdm(todays_ids):
subprocess.call(
f'pdfimages -all {pdf_path(paper_id)} {IMG_DIR}/{paper_id}',
shell=True,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
# + [markdown] Collapsed="false"
# # Colormap detection
......@@ -270,11 +225,23 @@ def contains_jet(pts, r_max):
return av_min_dist < r_max
# + [markdown] Collapsed="false"
# # Run detection
# + Collapsed="false"
todays_ids = preprint_ids()
if not todays_ids:
sys.exit()
today = date.today().isoformat().replace('-', '')
# + Collapsed="false"
jet_papers = []
for paper_id in tqdm.tqdm(todays_ids):
found_jet = False
for img in paper_images(paper_id):
for img in extract_images(response(paper_id)):
with img:
resized_img = img.resize((200, 200))
candidates = points_near_jet(resized_img, 0.2)
......@@ -364,7 +331,7 @@ evaluations = {
# Limit the number of preprints to 14, so that we don't exceed the max tweet length.
status_report = jinja2.Template("""arXiv/cond-mat STATUS REPORT {{date.strftime('%d/%m/%Y')}}
🚨🚨 JET COLORMAP DETECTED IN {{ jet_papers | length }} PREPRINTS 🚨🚨{{ ': ' if jet_papers else '.' }}
🚨🚨 JET DETECTED IN {{ jet_papers | length }} / {{ todays_ids | length }} PREPRINTS 🚨🚨{{ ': ' if jet_papers else '.' }}
{{- paper_links }}
{{ description }}
......@@ -372,7 +339,7 @@ status_report = jinja2.Template("""arXiv/cond-mat STATUS REPORT {{date.strftime(
# + Collapsed="false"
def prepare_tweet(jet_papers):
def prepare_tweet(todays_ids, jet_papers):
status_options = next(
options
for number, options in evaluations.items()
......@@ -382,7 +349,8 @@ def prepare_tweet(jet_papers):
# Compute how much space we have remaining
bare_length = len(
status_report.render(date=date.today(), jet_papers=jet_papers, description=description)
status_report.render(date=date.today(), jet_papers=jet_papers, description=description,
todays_ids=todays_ids)
)
remaining_chars = MAX_TWEET_LENGTH - bare_length
......@@ -400,7 +368,8 @@ def prepare_tweet(jet_papers):
paper_links = separator.join(jet_papers[:remaining_papers])
return status_report.render(
date=date.today(), jet_papers=jet_papers, description=description, paper_links=paper_links
date=date.today(), jet_papers=jet_papers, description=description, paper_links=paper_links,
todays_ids=todays_ids
)
......@@ -413,7 +382,7 @@ def prepare_tweet(jet_papers):
# tweet = prepare_tweet(jet_papers)
# + Collapsed="false"
tweet = prepare_tweet(jet_papers)
tweet = prepare_tweet(todays_ids, jet_papers)
print(tweet)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment