Commit c1aeeeb8 authored by André Melo
Browse files

Detect when images have wrong extension

parent fd8b3040
......@@ -4,6 +4,6 @@ scan arxiv:
only:
- schedules
before_script:
- conda install -y tweepy poppler pdf2image -c conda-forge
- conda install -y tweepy poppler pdf2image filetype -c conda-forge
script:
- python jet-detection.py
......@@ -8,9 +8,9 @@
# format_version: '1.5'
# jupytext_version: 1.3.4
# kernelspec:
# display_name: Python 3
# display_name: Python [conda env:.conda-plotocop]
# language: python
# name: python3
# name: conda-env-.conda-plotocop-py
# ---
# + [markdown] Collapsed="false"
......@@ -31,6 +31,7 @@ import subprocess
import os
import sys
import logging
import filetype
# Web-related
import requests
......@@ -74,7 +75,7 @@ def preprint_ids(only_fresh=True):
# + Collapsed="false"
# Image file extensions, written lowercase and without the leading dot, that
# archive member suffixes are compared against (`suffix in ALLOWED_EXTS`).
ALLOWED_EXTS = ['png', 'jpg', 'jpeg', 'tiff']
# Extensions of non-image files, in the same dot-less form.
# NOTE(review): IGNORE_EXTS is assigned twice in a row; the second assignment
# (which adds 'aux', 'bib' and 'sty') overrides the first, so only it takes effect.
IGNORE_EXTS = ['tex', 'bst', 'cls', 'bbl', 'dat', 'rtx']
IGNORE_EXTS = ['tex', 'bst', 'cls', 'bbl', 'dat', 'rtx', 'aux', 'bib', 'sty']
def pdf_url(paper_id):
    """Return the arXiv PDF URL for the given paper identifier.

    Args:
        paper_id: Identifier interpolated verbatim after the ``/pdf/`` path
            segment; it is formatted with ``str()`` semantics by the f-string.

    Returns:
        The URL string ``https://www.arxiv.org/pdf/<paper_id>``.
    """
    return f'https://www.arxiv.org/pdf/{paper_id}'
......@@ -143,6 +144,10 @@ def extract_images_pdf(pdf):
def extract_images_tar(tar):
for member in tar.getmembers():
suffix = Path(member.name).suffix[1:]
# Check for images with wrong extensions, e.g. a PNG file saved with a .jpg suffix
guess = filetype.guess(tar.extractfile(member))
if guess and guess.extension != suffix:
suffix = guess.extension
if not member.isfile():
continue
if suffix in ALLOWED_EXTS:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment