Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 11 additions & 13 deletions adsmp/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,22 +181,18 @@ def update_storage(self, bibcode, type, payload):
record.scix_id = "scix:" + str(self.generate_scix_id(record.bib_data))
out = record.toJSON()
session.commit()

# Send payload to Boost pipeline
if type != 'boost' and not self._config.get('TESTING_MODE', False):
try:
self.generate_boost_request_message(bibcode)
except Exception as e:
self.logger.exception('Error generating boost request message for bibcode %s: %s', bibcode, e)

return out
except exc.IntegrityError:
self.logger.exception('error in app.update_storage while updating database for bibcode {}, type {}'.format(bibcode, type))
session.rollback()
raise

def generate_scix_id(self, bib_data):
    """Return a SciX identifier generated from *bib_data*.

    Honors the optional ``SCIX_ID_GENERATION_FIELDS`` config setting,
    which restricts the bibliographic fields used for id generation;
    when the setting is absent or falsy, ``scix_id``'s default field
    set is used.

    :param bib_data: bibliographic data for the record (as stored on
        ``Records.bib_data``)
    :return: the generated SciX id string
    """
    # Removed the stale pre-refactor `return` that preceded this logic
    # in the diff and made it unreachable. A falsy config value (None,
    # empty list) is normalized to None, i.e. "use defaults".
    user_fields = self._config.get('SCIX_ID_GENERATION_FIELDS') or None
    return scix_id.generate_scix_id(bib_data, user_fields=user_fields)

def delete_by_bibcode(self, bibcode):
with self.session_scope() as session:
Expand Down Expand Up @@ -894,7 +890,7 @@ def should_include_in_sitemap(self, record):
3. If processed, processing isn't too stale

Args:
record: Dictionary with record data including bib_data, status, timestamps
record: Dictionary with record data including has_bib_data, status, timestamps

Returns:
bool: True if record should be included in sitemap, False otherwise
Expand All @@ -903,14 +899,14 @@ def should_include_in_sitemap(self, record):

# Extract values from record dictionary
bibcode = record.get('bibcode', None)
bib_data = record.get('bib_data', None)
has_bib_data = record.get('has_bib_data', None)
bib_data_updated = record.get('bib_data_updated')
solr_processed = record.get('solr_processed')
status = record.get('status')

# Must have bibliographic data
if not bib_data or not bibcode or (isinstance(bib_data, str) and not bib_data.strip()):
self.logger.debug('Excluding %s from sitemap: No bibcode or bib_data', bibcode)
if not has_bib_data or not bibcode:
self.logger.debug('Excluding %s from sitemap: No bibcode or has_bib_data is False', bibcode)
return False

# Exclude if SOLR failed or if record is being retried (previously failed)
Expand Down Expand Up @@ -959,6 +955,8 @@ def get_records_bulk(self, bibcodes, session, load_only=None):
record_data = {}
for field in (load_only or ['id', 'bibcode', 'bib_data', 'bib_data_updated', 'solr_processed', 'status']):
record_data[field] = getattr(record, field, None)
# Add has_bib_data boolean for sitemap checks
record_data['has_bib_data'] = bool(record_data.get('bib_data'))
records_dict[record.bibcode] = record_data

return records_dict
Expand Down
1 change: 1 addition & 0 deletions adsmp/solr_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ def extract_classifications_pipeline(db_classifications, solrdoc):
"""retrieve expected classifier collections

classifications is a solr virtual field so it should never be set"""
db_classifications = [element for element in db_classifications if element] # remove empty strings
if db_classifications is None or len(db_classifications) == 0:
return {"database" : solrdoc.get("database", None)}

Expand Down
86 changes: 81 additions & 5 deletions adsmp/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,73 @@
Queue('update-sitemap-files', app.exchange, routing_key='update-sitemap-files'),
Queue('update-scixid', app.exchange, routing_key='update-scixid'),
Queue('boost-request', app.exchange, routing_key='boost-request'),
Queue('augment-record', app.exchange, routing_key='augment-record'),
)


# ============================= TASKS ============================================= #
@app.task(queue='augment-record')
def task_augment_record(msg):
    """Receives payload to augment the record.

    Stores the payload under the appropriate type, then (for any type
    other than 'boost', and outside testing mode) queues a request to
    the Boost pipeline for the affected bibcode.

    @param msg: protobuff that contains at minimum
        - bibcode
        - and specific payload
    """
    logger.debug('Updating record: %s', msg)
    status = app.get_msg_status(msg)
    logger.debug('Message status: %s', status)
    type = app.get_msg_type(msg)
    logger.debug('Message type: %s', type)
    bibcodes = []
    # Bug fix: `record` was previously unbound when an 'active' message
    # carried an empty nonbib_records/metrics_records list, raising
    # UnboundLocalError at the `if record:` check below.
    record = None

    if status == 'active':
        # save into a database
        # passed msg may contain details on one bibcode or a list of bibcodes
        if type == 'nonbib_records':
            for m in msg.nonbib_records:
                m = Msg(m, None, None)  # m is a raw protobuf, TODO: return proper instance from .nonbib_records
                bibcodes.append(m.bibcode)
                record = app.update_storage(m.bibcode, 'nonbib_data', m.toJSON())
                if record:
                    logger.debug('Saved record from list: %s', record)
        elif type == 'metrics_records':
            for m in msg.metrics_records:
                m = Msg(m, None, None)
                bibcodes.append(m.bibcode)
                record = app.update_storage(m.bibcode, 'metrics', m.toJSON(including_default_value_fields=True))
                if record:
                    logger.debug('Saved record from list: %s', record)
        elif type == 'augment':
            bibcodes.append(msg.bibcode)
            record = app.update_storage(msg.bibcode, 'augment',
                                        msg.toJSON(including_default_value_fields=True))
            if record:
                logger.debug('Saved augment message: %s', msg)
        elif type == 'classify':
            bibcodes.append(msg.bibcode)
            payload = msg.toJSON(including_default_value_fields=True)
            logger.debug('message to JSON: %s', payload)
            # only the 'collections' portion of the classify payload is stored
            payload = payload['collections']
            record = app.update_storage(msg.bibcode, 'classify', payload)
            if record:
                logger.debug('Saved classify message: %s', msg)
        else:
            # here when record has a single bibcode
            bibcodes.append(msg.bibcode)
            record = app.update_storage(msg.bibcode, type, msg.toJSON())
            if record:
                logger.debug('Saved record: %s', record)
        if record:
            # Send payload to Boost pipeline
            # NOTE(review): msg.bibcode is used here even for the list-type
            # branches (nonbib_records/metrics_records), where the top-level
            # msg may not expose a bibcode — confirm against the protobuf.
            if type != 'boost' and not app._config.get('TESTING_MODE', False):
                try:
                    task_boost_request.apply_async(args=(msg.bibcode,))
                except Exception as e:
                    app.logger.exception('Error generating boost request message for bibcode %s: %s', msg.bibcode, e)
    else:
        logger.error('Received a message with unclear status: %s', msg)

@app.task(queue='update-record')
def task_update_record(msg):
Expand Down Expand Up @@ -94,19 +157,22 @@ def task_update_record(msg):
record = app.update_storage(m.bibcode, 'nonbib_data', m.toJSON())
if record:
logger.debug('Saved record from list: %s', record)
_generate_boost_request(m, type)
elif type == 'metrics_records':
for m in msg.metrics_records:
m = Msg(m, None, None)
bibcodes.append(m.bibcode)
record = app.update_storage(m.bibcode, 'metrics', m.toJSON(including_default_value_fields=True))
if record:
logger.debug('Saved record from list: %s', record)
_generate_boost_request(m, type)
elif type == 'augment':
bibcodes.append(msg.bibcode)
record = app.update_storage(msg.bibcode, 'augment',
msg.toJSON(including_default_value_fields=True))
if record:
logger.debug('Saved augment message: %s', msg)
_generate_boost_request(msg, type)
elif type == 'classify':
bibcodes.append(msg.bibcode)
logger.debug(f'message to JSON: {msg.toJSON(including_default_value_fields=True)}')
Expand All @@ -115,21 +181,32 @@ def task_update_record(msg):
record = app.update_storage(msg.bibcode, 'classify',payload)
if record:
logger.debug('Saved classify message: %s', msg)
_generate_boost_request(msg, type)
else:
# here when record has a single bibcode
bibcodes.append(msg.bibcode)
record = app.update_storage(msg.bibcode, type, msg.toJSON())
if record:
logger.debug('Saved record: %s', record)
_generate_boost_request(msg, type)
if type == 'metadata':
# with new bib data we request to augment the affiliation
# that pipeline will eventually respond with a msg to task_update_record
logger.debug('requesting affilation augmentation for %s', msg.bibcode)
app.request_aff_augment(msg.bibcode)

else:
logger.error('Received a message with unclear status: %s', msg)

def _generate_boost_request(msg, msg_type):
    """Queue a Boost-pipeline request for the message's bibcode.

    The request is skipped when *msg_type* appears in the
    ``IGNORED_BOOST_PAYLOAD_TYPES`` config list (default: ``['boost']``)
    or when ``TESTING_MODE`` is enabled; failures to enqueue are logged
    and swallowed so record storage is never rolled back by a boost error.

    @param msg: message object carrying at minimum a ``bibcode`` attribute
    @param msg_type: string type of the message being processed
    """
    # Send payload to Boost pipeline
    if msg_type not in app._config.get('IGNORED_BOOST_PAYLOAD_TYPES', ['boost']) and not app._config.get('TESTING_MODE', False):
        try:
            task_boost_request.apply_async(args=(msg.bibcode,))
        except Exception as e:
            app.logger.exception('Error generating boost request message for bibcode %s: %s', msg.bibcode, e)
    else:
        # Bug fix: the original called str.format() on a %s-style template,
        # which leaves the placeholders unfilled; pass lazy logger args.
        app.logger.debug('Message for bibcode %s has type: %s, Skipping.', msg.bibcode, msg_type)

@app.task(queue='update-scixid')
def task_update_scixid(bibcodes, flag):
"""Receives bibcodes to add scix id to the record.
Expand Down Expand Up @@ -428,7 +505,7 @@ def task_cleanup_invalid_sitemaps():
session.query(
SitemapInfo.id,
SitemapInfo.bibcode,
Records.bib_data,
(Records.bib_data.isnot(None)).label('has_bib_data'),
Records.bib_data_updated,
Records.solr_processed,
Records.status
Expand Down Expand Up @@ -457,7 +534,7 @@ def task_cleanup_invalid_sitemaps():
# Convert to dict for should_include_in_sitemap function
record_dict = {
'bibcode': record_data.bibcode,
'bib_data': record_data.bib_data,
'has_bib_data': record_data.has_bib_data,
'bib_data_updated': record_data.bib_data_updated,
'solr_processed': record_data.solr_processed,
'status': record_data.status
Expand Down Expand Up @@ -626,7 +703,7 @@ def task_manage_sitemap(bibcodes, action):
# Apply SOLR filtering - convert record to dict for should_include_in_sitemap
record_dict = {
'bibcode': record.bibcode,
'bib_data': record.bib_data,
'has_bib_data': bool(record.bib_data),
'bib_data_updated': record.bib_data_updated,
'solr_processed': record.solr_processed,
'status': record.status
Expand Down Expand Up @@ -688,7 +765,6 @@ def task_manage_sitemap(bibcodes, action):
logger.info('Bootstrap completed: %d successful, %d failed out of %d total records',
successful_count, failed_count, processed)
logger.info('All records marked with update_flag=True')
logger.info('Run --update-sitemap-files to generate sitemap XML files')
return

elif action in ['add', 'force-update']:
Expand Down
Loading