Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 11 additions & 13 deletions adsmp/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,22 +181,18 @@ def update_storage(self, bibcode, type, payload):
record.scix_id = "scix:" + str(self.generate_scix_id(record.bib_data))
out = record.toJSON()
session.commit()

# Send payload to Boost pipeline
if type != 'boost' and not self._config.get('TESTING_MODE', False):
try:
self.generate_boost_request_message(bibcode)
except Exception as e:
self.logger.exception('Error generating boost request message for bibcode %s: %s', bibcode, e)

return out
except exc.IntegrityError:
self.logger.exception('error in app.update_storage while updating database for bibcode {}, type {}'.format(bibcode, type))
session.rollback()
raise

def generate_scix_id(self, bib_data):
    """Return a SciX identifier generated from *bib_data*.

    Honors the optional ``SCIX_ID_GENERATION_FIELDS`` config setting,
    which restricts the bibliographic fields used for id generation;
    when the setting is absent or falsy, ``scix_id``'s default field
    set is used.

    :param bib_data: bibliographic data for the record (as stored on
        ``Records.bib_data``)
    :return: the generated SciX id string
    """
    # Removed the stale pre-refactor `return` that preceded this logic
    # in the diff and made it unreachable. A falsy config value (None,
    # empty list) is normalized to None, i.e. "use defaults".
    user_fields = self._config.get('SCIX_ID_GENERATION_FIELDS') or None
    return scix_id.generate_scix_id(bib_data, user_fields=user_fields)

def delete_by_bibcode(self, bibcode):
with self.session_scope() as session:
Expand Down Expand Up @@ -894,7 +890,7 @@ def should_include_in_sitemap(self, record):
3. If processed, processing isn't too stale

Args:
record: Dictionary with record data including bib_data, status, timestamps
record: Dictionary with record data including has_bib_data, status, timestamps

Returns:
bool: True if record should be included in sitemap, False otherwise
Expand All @@ -903,14 +899,14 @@ def should_include_in_sitemap(self, record):

# Extract values from record dictionary
bibcode = record.get('bibcode', None)
bib_data = record.get('bib_data', None)
has_bib_data = record.get('has_bib_data', None)
bib_data_updated = record.get('bib_data_updated')
solr_processed = record.get('solr_processed')
status = record.get('status')

# Must have bibliographic data
if not bib_data or not bibcode or (isinstance(bib_data, str) and not bib_data.strip()):
self.logger.debug('Excluding %s from sitemap: No bibcode or bib_data', bibcode)
if not has_bib_data or not bibcode:
self.logger.debug('Excluding %s from sitemap: No bibcode or has_bib_data is False', bibcode)
return False

# Exclude if SOLR failed or if record is being retried (previously failed)
Expand Down Expand Up @@ -959,6 +955,8 @@ def get_records_bulk(self, bibcodes, session, load_only=None):
record_data = {}
for field in (load_only or ['id', 'bibcode', 'bib_data', 'bib_data_updated', 'solr_processed', 'status']):
record_data[field] = getattr(record, field, None)
# Add has_bib_data boolean for sitemap checks
record_data['has_bib_data'] = bool(record_data.get('bib_data'))
records_dict[record.bibcode] = record_data

return records_dict
Expand Down
1 change: 1 addition & 0 deletions adsmp/solr_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ def extract_classifications_pipeline(db_classifications, solrdoc):
"""retrieve expected classifier collections

classifications is a solr virtual field so it should never be set"""
db_classifications = [element for element in db_classifications if element] # remove empty strings
if db_classifications is None or len(db_classifications) == 0:
return {"database" : solrdoc.get("database", None)}

Expand Down
86 changes: 81 additions & 5 deletions adsmp/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,73 @@
Queue('update-sitemap-files', app.exchange, routing_key='update-sitemap-files'),
Queue('update-scixid', app.exchange, routing_key='update-scixid'),
Queue('boost-request', app.exchange, routing_key='boost-request'),
Queue('augment-record', app.exchange, routing_key='augment-record'),
)


# ============================= TASKS ============================================= #
@app.task(queue='augment-record')
def task_augment_record(msg):
    """Receives payload to augment the record.

    Stores the payload under the appropriate type, then (for any type
    other than 'boost', and outside testing mode) queues a request to
    the Boost pipeline for the affected bibcode.

    @param msg: protobuff that contains at minimum
        - bibcode
        - and specific payload
    """
    logger.debug('Updating record: %s', msg)
    status = app.get_msg_status(msg)
    logger.debug('Message status: %s', status)
    type = app.get_msg_type(msg)
    logger.debug('Message type: %s', type)
    bibcodes = []
    # Bug fix: `record` was previously unbound when an 'active' message
    # carried an empty nonbib_records/metrics_records list, raising
    # UnboundLocalError at the `if record:` check below.
    record = None

    if status == 'active':
        # save into a database
        # passed msg may contain details on one bibcode or a list of bibcodes
        if type == 'nonbib_records':
            for m in msg.nonbib_records:
                m = Msg(m, None, None)  # m is a raw protobuf, TODO: return proper instance from .nonbib_records
                bibcodes.append(m.bibcode)
                record = app.update_storage(m.bibcode, 'nonbib_data', m.toJSON())
                if record:
                    logger.debug('Saved record from list: %s', record)
        elif type == 'metrics_records':
            for m in msg.metrics_records:
                m = Msg(m, None, None)
                bibcodes.append(m.bibcode)
                record = app.update_storage(m.bibcode, 'metrics', m.toJSON(including_default_value_fields=True))
                if record:
                    logger.debug('Saved record from list: %s', record)
        elif type == 'augment':
            bibcodes.append(msg.bibcode)
            record = app.update_storage(msg.bibcode, 'augment',
                                        msg.toJSON(including_default_value_fields=True))
            if record:
                logger.debug('Saved augment message: %s', msg)
        elif type == 'classify':
            bibcodes.append(msg.bibcode)
            payload = msg.toJSON(including_default_value_fields=True)
            logger.debug('message to JSON: %s', payload)
            # only the 'collections' portion of the classify payload is stored
            payload = payload['collections']
            record = app.update_storage(msg.bibcode, 'classify', payload)
            if record:
                logger.debug('Saved classify message: %s', msg)
        else:
            # here when record has a single bibcode
            bibcodes.append(msg.bibcode)
            record = app.update_storage(msg.bibcode, type, msg.toJSON())
            if record:
                logger.debug('Saved record: %s', record)
        if record:
            # Send payload to Boost pipeline
            # NOTE(review): msg.bibcode is used here even for the list-type
            # branches (nonbib_records/metrics_records), where the top-level
            # msg may not expose a bibcode — confirm against the protobuf.
            if type != 'boost' and not app._config.get('TESTING_MODE', False):
                try:
                    task_boost_request.apply_async(args=(msg.bibcode,))
                except Exception as e:
                    app.logger.exception('Error generating boost request message for bibcode %s: %s', msg.bibcode, e)
    else:
        logger.error('Received a message with unclear status: %s', msg)

@app.task(queue='update-record')
def task_update_record(msg):
Expand Down Expand Up @@ -94,19 +157,22 @@ def task_update_record(msg):
record = app.update_storage(m.bibcode, 'nonbib_data', m.toJSON())
if record:
logger.debug('Saved record from list: %s', record)
_generate_boost_request(m, type)
elif type == 'metrics_records':
for m in msg.metrics_records:
m = Msg(m, None, None)
bibcodes.append(m.bibcode)
record = app.update_storage(m.bibcode, 'metrics', m.toJSON(including_default_value_fields=True))
if record:
logger.debug('Saved record from list: %s', record)
_generate_boost_request(m, type)
elif type == 'augment':
bibcodes.append(msg.bibcode)
record = app.update_storage(msg.bibcode, 'augment',
msg.toJSON(including_default_value_fields=True))
if record:
logger.debug('Saved augment message: %s', msg)
_generate_boost_request(msg, type)
elif type == 'classify':
bibcodes.append(msg.bibcode)
logger.debug(f'message to JSON: {msg.toJSON(including_default_value_fields=True)}')
Expand All @@ -115,21 +181,32 @@ def task_update_record(msg):
record = app.update_storage(msg.bibcode, 'classify',payload)
if record:
logger.debug('Saved classify message: %s', msg)
_generate_boost_request(msg, type)
else:
# here when record has a single bibcode
bibcodes.append(msg.bibcode)
record = app.update_storage(msg.bibcode, type, msg.toJSON())
if record:
logger.debug('Saved record: %s', record)
_generate_boost_request(msg, type)
if type == 'metadata':
# with new bib data we request to augment the affiliation
# that pipeline will eventually respond with a msg to task_update_record
logger.debug('requesting affilation augmentation for %s', msg.bibcode)
app.request_aff_augment(msg.bibcode)

else:
logger.error('Received a message with unclear status: %s', msg)

def _generate_boost_request(msg, msg_type):
    """Queue a Boost-pipeline request for the message's bibcode.

    The request is skipped when *msg_type* appears in the
    ``IGNORED_BOOST_PAYLOAD_TYPES`` config list (default: ``['boost']``)
    or when ``TESTING_MODE`` is enabled; failures to enqueue are logged
    and swallowed so record storage is never rolled back by a boost error.

    @param msg: message object carrying at minimum a ``bibcode`` attribute
    @param msg_type: string type of the message being processed
    """
    # Send payload to Boost pipeline
    if msg_type not in app._config.get('IGNORED_BOOST_PAYLOAD_TYPES', ['boost']) and not app._config.get('TESTING_MODE', False):
        try:
            task_boost_request.apply_async(args=(msg.bibcode,))
        except Exception as e:
            app.logger.exception('Error generating boost request message for bibcode %s: %s', msg.bibcode, e)
    else:
        # Bug fix: the original called str.format() on a %s-style template,
        # which leaves the placeholders unfilled; pass lazy logger args.
        app.logger.debug('Message for bibcode %s has type: %s, Skipping.', msg.bibcode, msg_type)

@app.task(queue='update-scixid')
def task_update_scixid(bibcodes, flag):
"""Receives bibcodes to add scix id to the record.
Expand Down Expand Up @@ -428,7 +505,7 @@ def task_cleanup_invalid_sitemaps():
session.query(
SitemapInfo.id,
SitemapInfo.bibcode,
Records.bib_data,
(Records.bib_data.isnot(None)).label('has_bib_data'),
Records.bib_data_updated,
Records.solr_processed,
Records.status
Expand Down Expand Up @@ -457,7 +534,7 @@ def task_cleanup_invalid_sitemaps():
# Convert to dict for should_include_in_sitemap function
record_dict = {
'bibcode': record_data.bibcode,
'bib_data': record_data.bib_data,
'has_bib_data': record_data.has_bib_data,
'bib_data_updated': record_data.bib_data_updated,
'solr_processed': record_data.solr_processed,
'status': record_data.status
Expand Down Expand Up @@ -626,7 +703,7 @@ def task_manage_sitemap(bibcodes, action):
# Apply SOLR filtering - convert record to dict for should_include_in_sitemap
record_dict = {
'bibcode': record.bibcode,
'bib_data': record.bib_data,
'has_bib_data': bool(record.bib_data),
'bib_data_updated': record.bib_data_updated,
'solr_processed': record.solr_processed,
'status': record.status
Expand Down Expand Up @@ -688,7 +765,6 @@ def task_manage_sitemap(bibcodes, action):
logger.info('Bootstrap completed: %d successful, %d failed out of %d total records',
successful_count, failed_count, processed)
logger.info('All records marked with update_flag=True')
logger.info('Run --update-sitemap-files to generate sitemap XML files')
return

elif action in ['add', 'force-update']:
Expand Down
Loading