[Feature] Update time series (#13)
- [Feature] update time series model (d9bead416bc2e29ee7e2d4f964840a5699007664)
- [Weight] Update time series safetensors (262600bfe6a5013e9407e3f6b7be303a3dbdce42)
- [test] update time series test scripts (bbe1e5b60f58e4587d421dca5d74cea85e17e891)
- [Fix] Remove hard requirements of pandas (f2a60cf1369f53de255813688aad3557d0957391)
Co-authored-by: yehaochen <yehaochen@users.noreply.huggingface.co>
- 0092638_seism.npy +3 -0
- chat_template.jinja +2 -0
- config.json +32 -0
- configuration_interns1_pro.py +52 -2
- model-time_series-00001-of-00002.safetensors +3 -0
- model-time_series-00002-of-00002.safetensors +3 -0
- model.safetensors.index.json +2 -2
- modeling_interns1_pro.py +503 -4
- processing_interns1_pro.py +147 -2
- test_inference_ts.py +78 -0
0092638_seism.npy
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2b94653c6964b630038897a27cb6d276ff866d9ecd1f6419358b9407f0df62e
+size 72128
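The seismic sample is checked in as a Git LFS pointer (72128 bytes once fetched). A quick sanity check of its expected layout, an assumption based on the `.npy` branch of `time_series_processor` added in processing_interns1_pro.py below, which loads signals as `[T]` or `[T, C]` float arrays:

```python
import numpy as np

# Fetch the real payload with `git lfs pull` first; the pointer file alone is not loadable.
ts = np.load("0092638_seism.npy")
print(ts.shape, ts.dtype)  # expected: (T,) or (T, C), float
```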
chat_template.jinja
CHANGED
@@ -17,6 +17,8 @@
 {{- 'Video ' + video_count.value|string + ': <|vision_start|><|video_pad|><|vision_end|>'-}}
 {%- elif 'text' in item %}
 {{- item.text }}
+{%- elif 'time_series' in item or item.type == 'time_series' %}
+{{- '<|ts|><TS_CONTEXT><|/ts|>'-}}
 {%- endif %}
 {%- endfor %}
 {%- endif %}
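With the new branch in place, a user turn carrying a time-series item renders to the `<|ts|><TS_CONTEXT><|/ts|>` placeholder instead of being dropped. A minimal sketch (the checkpoint path is hypothetical; the content keys match `time_series_preprocessor` below):

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("path/to/interns1-pro", trust_remote_code=True)
conversation = [{
    "role": "user",
    "content": [
        {"type": "time_series", "data": "0092638_seism.npy", "sampling_rate": 100.0},
        {"type": "text", "text": "Describe this seismic signal."},
    ],
}]
prompt = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
assert "<|ts|><TS_CONTEXT><|/ts|>" in prompt
```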
config.json
CHANGED
@@ -58,6 +58,37 @@
   },
   "vision_end_token_id": 151653,
   "vision_start_token_id": 151652,
+  "ts_config": {
+    "auto_map": {
+      "AutoConfig": "configuration_interns1_pro.InternS1ProTimeSeriesConfig",
+      "AutoModel": "modeling_interns1_pro.InternS1ProTimeSeriesModel"
+    },
+    "activation_dropout": 0.0,
+    "activation_function": "gelu",
+    "architectures": [
+      "InternS1TimeSeriesModel"
+    ],
+    "attention_dropout": 0.0,
+    "d_model": 768,
+    "dropout": 0.0,
+    "dtype": "bfloat16",
+    "encoder_attention_heads": 8,
+    "encoder_ffn_dim": 3072,
+    "encoder_layerdrop": 0.0,
+    "encoder_layers": 17,
+    "model_type": "interns1_pro_time_series",
+    "max_source_positions": 1500,
+    "num_mel_bins": 80,
+    "out_hidden_size": 4096,
+    "scale_embedding": false,
+    "ts_adapt_in_dim": 256,
+    "ts_adapt_out_dim": 1024,
+    "use_cache": true,
+    "attn_implementation": "eager"
+  },
+  "ts_end_id": 151684,
+  "ts_start_id": 151683,
+  "ts_token_id": 151685,
   "auto_map": {
     "AutoConfig": "configuration_interns1_pro.InternS1ProConfig",
     "AutoModel": "modeling_interns1_pro.InternS1ProModel",
@@ -141,6 +172,7 @@
     "model.visual.blocks.17.mlp.linear_fc1",
     "model.visual.blocks.4.norm2",
     "model.visual.blocks.17.attn.qkv",
+    "model.time_series",
     "model.language_model.layers.83.self_attn.k_norm",
     "model.language_model.layers.47.post_attention_layernorm",
     "model.language_model.layers.59.input_layernorm",
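The `ts_config` block nests a Whisper-style encoder config under the top-level model config, and the three `ts_*_id` entries register the placeholder tokens. A hedged load sketch (path hypothetical):

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("path/to/interns1-pro", trust_remote_code=True)
print(cfg.ts_token_id)                # 151685, id of the <TS_CONTEXT> placeholder
print(cfg.ts_config.d_model)          # 768, width of the time-series encoder
print(cfg.ts_config.out_hidden_size)  # 4096, projected width fed to the language model
```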
configuration_interns1_pro.py
CHANGED
@@ -15,6 +15,7 @@
 
 from transformers.configuration_utils import PretrainedConfig
 from transformers.modeling_rope_utils import rope_config_validation
+from transformers import WhisperConfig
 
 
 class InternS1ProTextConfig(PretrainedConfig):
@@ -138,20 +139,61 @@ class InternS1ProVisionConfig(PretrainedConfig):
         self.num_position_embeddings = num_position_embeddings
         self.initializer_range = initializer_range
 
+
+class InternS1ProTimeSeriesConfig(WhisperConfig):
+
+    model_type = "interns1_pro_time_series"
+    base_config_key = "ts_config"
+
+    def __init__(
+        self,
+        ts_adapt_in_dim: int = 256,
+        ts_adapt_out_dim: int = 1024,
+        ts_hidden_dim: int = 1024,
+        ts_cnn_channels: list[int] = [1, 32, 64, 128, 128],
+        ts_cnn_kernel_sizes: list[int] = [3, 5, 5, 5],
+        ts_cnn_strides: list[int] = [2, 4, 4, 5],
+        ts_cnn_paddings: list[int] = [1, 2, 2, 2],
+        ts_concat_subsampling_in_channels: int = 128,
+        ts_concat_subsampling_concat_size: int = 2,
+        use_flash_attn: bool = False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.ts_cnn_channels = ts_cnn_channels
+        self.ts_cnn_kernel_sizes = ts_cnn_kernel_sizes
+        self.ts_cnn_strides = ts_cnn_strides
+        self.ts_cnn_paddings = ts_cnn_paddings
+        self.ts_concat_subsampling_in_channels = ts_concat_subsampling_in_channels
+        self.ts_concat_subsampling_concat_size = ts_concat_subsampling_concat_size
+
+        self.ts_adapt_in_dim = ts_adapt_in_dim
+        self.ts_adapt_out_dim = ts_adapt_out_dim
+
+        self.ts_hidden_dim = ts_hidden_dim
+        self.use_flash_attn = use_flash_attn
+
+        assert self.ts_adapt_out_dim == self.ts_hidden_dim, "ts_adapt_out_dim should be equal to ts_hidden_dim"
+        assert self.ts_concat_subsampling_in_channels == self.ts_cnn_channels[-1], "ts_concat_subsampling_in_channels should be equal to the out_channel of the last cnn layer"
+
 
 class InternS1ProConfig(PretrainedConfig):
     model_type = "interns1_pro"
-    sub_configs = {"vision_config": InternS1ProVisionConfig, "text_config": InternS1ProTextConfig}
+    sub_configs = {"vision_config": InternS1ProVisionConfig, "text_config": InternS1ProTextConfig, "ts_config": InternS1ProTimeSeriesConfig}
     keys_to_ignore_at_inference = ["past_key_values"]
 
     def __init__(
         self,
         text_config=None,
         vision_config=None,
+        ts_config=None,
         image_token_id=151655,
         video_token_id=151656,
         vision_start_token_id=151652,
         vision_end_token_id=151653,
+        ts_token_id=151685,
+        ts_start_id=151683,
+        ts_end_id=151684,
         tie_word_embeddings=False,
         **kwargs,
     ):
@@ -165,11 +207,19 @@ class InternS1ProConfig(PretrainedConfig):
         elif text_config is None:
             self.text_config = self.sub_configs["text_config"]()
 
+        if isinstance(ts_config, dict):
+            self.ts_config = self.sub_configs["ts_config"](**ts_config)
+        elif ts_config is None:
+            self.ts_config = self.sub_configs["ts_config"]()
+
         self.image_token_id = image_token_id
         self.video_token_id = video_token_id
         self.vision_start_token_id = vision_start_token_id
         self.vision_end_token_id = vision_end_token_id
+        self.ts_token_id = ts_token_id
+        self.ts_start_id = ts_start_id
+        self.ts_end_id = ts_end_id
         super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
 
 
-__all__ = ["InternS1ProConfig", "InternS1ProTextConfig", "InternS1ProVisionConfig"]
+__all__ = ["InternS1ProConfig", "InternS1ProTextConfig", "InternS1ProVisionConfig", "InternS1ProTimeSeriesConfig"]
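The two assertions at the end of `InternS1ProTimeSeriesConfig.__init__` couple the adapter output width to the encoder's working width, and the subsampler's input channels to the last CNN stage. A standalone check of both the defaults and the failure mode:

```python
from configuration_interns1_pro import InternS1ProTimeSeriesConfig

cfg = InternS1ProTimeSeriesConfig()
assert cfg.ts_adapt_out_dim == cfg.ts_hidden_dim == 1024
assert cfg.ts_concat_subsampling_in_channels == cfg.ts_cnn_channels[-1] == 128

# Violating the coupling trips the assertion in __init__:
try:
    InternS1ProTimeSeriesConfig(ts_adapt_out_dim=512)  # ts_hidden_dim stays at 1024
except AssertionError as err:
    print(err)  # ts_adapt_out_dim should be equal to ts_hidden_dim
```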
model-time_series-00001-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6fab87c45c01a8695f97b5801bee2771ac6e874561ac773983397d958f1e7a00
+size 291982664

model-time_series-00002-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4150fadfb90bd9561c422b37ecc83fd5a30966f1e555bc9305b9fd5d2c914b0d
+size 10240128

model.safetensors.index.json
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:6aa1acb6e462542ccb55d50c9ba2097df081b6fd69b8ac5aaed1f0b30b14678e
+size 32236540
modeling_interns1_pro.py
CHANGED
@@ -34,8 +34,10 @@ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from transformers.processing_utils import Unpack
 from transformers.utils import TransformersKwargs, auto_docstring, is_torchdynamo_compiling
 from transformers.utils.generic import OutputRecorder, check_model_inputs
-from .configuration_interns1_pro import InternS1ProConfig, InternS1ProTextConfig, InternS1ProVisionConfig
-
+from .configuration_interns1_pro import InternS1ProConfig, InternS1ProTextConfig, InternS1ProVisionConfig, InternS1ProTimeSeriesConfig
+from transformers.models.whisper.modeling_whisper import WhisperEncoderLayer
+from transformers import WhisperPreTrainedModel
+import math
 
 @use_kernel_forward_from_hub("RMSNorm")
 class Qwen3VLMoeTextRMSNorm(nn.Module):
@@ -439,7 +441,7 @@ class InternS1ProPreTrainedModel(PreTrainedModel):
     config: InternS1ProConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["InternS1ProMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock"]
+    _no_split_modules = ["InternS1ProMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock", "WhisperEncoderLayer"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
@@ -1057,6 +1059,442 @@ class InternS1ProTextModel(InternS1ProPreTrainedModel):
         )
 
 
+class InternS1ProTimeSeriesEncoder(WhisperPreTrainedModel):
+    def __init__(self, config: InternS1ProTimeSeriesConfig):
+        super().__init__(config)
+        self.config = config
+        self.dropout = config.dropout
+        self.layerdrop = config.encoder_layerdrop
+
+        self.embed_dim = config.d_model
+        self.num_mel_bins = config.num_mel_bins
+        self.padding_idx = config.pad_token_id
+        self.max_source_positions = config.max_source_positions
+        self.embed_scale = math.sqrt(self.embed_dim) if config.scale_embedding else 1.0
+
+        self.conv1 = nn.Conv1d(self.num_mel_bins, self.embed_dim, kernel_size=3, padding=1)
+        self.conv2 = nn.Conv1d(self.embed_dim, self.embed_dim, kernel_size=3, stride=2, padding=1)
+        self.embed_positions = nn.Embedding(self.max_source_positions, self.embed_dim)
+
+        self.layers = nn.ModuleList([WhisperEncoderLayer(config) for _ in range(config.encoder_layers)])
+        self.layer_norm = nn.LayerNorm(config.d_model)
+
+        self.gradient_checkpointing = False
+        self.post_init()
+
+        self.mask_type = None
+        self.chunk_length = None
+
+        self.adapt_in = nn.Linear(config.ts_adapt_in_dim, 80)
+        self.adapt_out = nn.Linear(self.embed_dim, config.ts_adapt_out_dim)
+
+    def _freeze_parameters(self):
+        for param in self.parameters():
+            param.requires_grad = False
+        self._requires_grad = False
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.conv1
+
+    def set_input_embeddings(self, value: nn.Module):
+        self.conv1 = value
+
+    def define_masktype(self, masktype, chunk_length=None):
+        self.mask_type = masktype
+        self.chunk_length = chunk_length
+
+    def _make_causal_mask(
+        self, input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+    ):
+        """
+        Make causal mask used for bi-directional self-attention.
+        """
+        bsz, tgt_len = input_ids_shape
+        mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+        mask_cond = torch.arange(mask.size(-1), device=device)
+        mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+        mask = mask.to(dtype)
+
+        if past_key_values_length > 0:
+            mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+        return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+    # Copied from transformers.models.bart.modeling_bart._expand_mask
+    def _expand_mask(self, mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+        """
+        Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+        """
+        bsz, src_len = mask.size()
+        tgt_len = tgt_len if tgt_len is not None else src_len
+
+        expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+        inverted_mask = 1.0 - expanded_mask
+
+        return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+        # create causal mask
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        combined_attention_mask = None
+
+        if input_shape[-1] > 1:
+            combined_attention_mask = self._make_causal_mask(
+                input_shape,
+                inputs_embeds.dtype,
+                device=inputs_embeds.device,
+                past_key_values_length=past_key_values_length,
+            )
+
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            expanded_attn_mask = self._expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
+            combined_attention_mask = (
+                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+            )
+        return combined_attention_mask
+
+    def prepare_chunk_attention_mask(self, attention_mask, input_shape, inputs_embeds):
+        block_size = round(self.chunk_length / 4 * 2)
+        matrix_size = input_shape[1]
+
+        matrix = torch.ones(matrix_size, matrix_size)
+
+        num_full_blocks = round(matrix_size // block_size)
+        remainder = matrix_size % block_size
+        for i in range(num_full_blocks):
+            row_start = i * block_size
+            col_start = i * block_size
+            matrix[row_start:row_start + block_size, col_start:col_start + block_size] = torch.zeros(block_size, block_size)
+
+        if remainder > 0:
+            last_row_start = num_full_blocks * block_size
+            last_col_start = num_full_blocks * block_size
+            matrix[last_row_start:last_row_start + remainder, last_col_start:last_col_start + remainder] = torch.zeros(remainder, remainder)
+
+        matrix = matrix * -65504
+        matrix = matrix.unsqueeze(0).unsqueeze(0).repeat(input_shape[0], 1, 1, 1)
+        attention_mask = matrix.to(inputs_embeds.device)
+        return attention_mask
+
+    def forward(
+        self,
+        input_features,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        # (N, T, C) -> (T, N, C) -> (N, C, T)
+        input_features = input_features.permute(1, 0, 2)
+        input_features = self.adapt_in(input_features)
+        input_features = input_features.permute(1, 2, 0)
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # (N, C, T) -> (N, C, T//2)
+        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
+        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
+
+        # (N, C, T) -> (N, T, C)
+        inputs_embeds = inputs_embeds.permute(0, 2, 1)  # torch.Size([1, 100, 768])
+        embed_pos = self.embed_positions.weight  # torch.Size([1500, 768])
+
+        if inputs_embeds.shape[1] > embed_pos.shape[0]:
+            target_len = inputs_embeds.shape[1]
+            padding = [0, 0, 0, target_len - embed_pos.shape[0]]
+
+            embed_pos = nn.functional.pad(embed_pos, pad=padding, mode='constant', value=0)
+            hidden_states = inputs_embeds[:, :embed_pos.shape[0], :] + embed_pos
+        else:
+            hidden_states = inputs_embeds + embed_pos[:inputs_embeds.shape[1], :]
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        input_shape = inputs_embeds.size()[:-1]
+        past_key_values_length = 0
+        attention_mask = None
+        if self.mask_type == 'chunk':
+            attention_mask = self.prepare_chunk_attention_mask(attention_mask, input_shape, inputs_embeds)
+        else:
+            attention_mask = self._prepare_decoder_attention_mask(
+                attention_mask, input_shape, inputs_embeds, past_key_values_length
+            )
+
+        if head_mask is not None:
+            assert head_mask.size()[0] == (
+                len(self.layers)
+            ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (self.layer_norm(hidden_states),)
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            to_drop = False
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:  # skip the layer
+                    to_drop = True
+
+            if to_drop:
+                layer_outputs = (None, None)
+            else:
+                if self.gradient_checkpointing and self.training:
+
+                    def create_custom_forward(module):
+                        def custom_forward(*inputs):
+                            return module(*inputs, output_attentions)
+
+                        return custom_forward
+
+                    layer_outputs = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(encoder_layer),
+                        hidden_states,
+                        attention_mask,
+                        (head_mask[idx] if head_mask is not None else None),
+                    )
+                else:
+                    layer_outputs = encoder_layer(
+                        hidden_states,
+                        attention_mask,
+                        layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                        output_attentions=output_attentions,
+                    )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        # (N, T, C) -> (T, N, C)
+        hidden_states = hidden_states.permute(1, 0, 2)
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states = self.adapt_out(hidden_states)
+
+        # (T, N, C) -> (N, T, C)
+        hidden_states = hidden_states.permute(1, 0, 2)
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return ModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+class InternS1ProTimeSeriesConcatSubsampling(nn.Module):
+    def __init__(self, in_channels: int, concat_size: int):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = in_channels * concat_size
+
+    def forward(self, ts_signals: torch.Tensor, ts_lens: torch.Tensor):
+        if ts_signals.shape[1] % 2 != 0:
+            ts_signals = ts_signals[:, :-1, :]
+        even_frames = ts_signals[:, ::2, :]
+        odd_frames = ts_signals[:, 1::2, :]
+        ts_signals = torch.cat((even_frames, odd_frames), dim=2)
+        ts_lens = ts_lens // 2
+        return ts_signals, ts_lens
+
+
+class InternS1ProTimeSeriesFixPositionalEncoding(nn.Module):
+    def __init__(self, d_model, max_len=20000):
+        super().__init__()
+        pe = torch.zeros(max_len, d_model, dtype=torch.float)
+        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) * (-math.log(10000.0) / d_model))
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        pe = pe.unsqueeze(0).transpose(0, 1).to(torch.float32)  # (max_len, 1, d_model)
+        self.register_buffer('pe', pe, persistent=True)
+
+    def forward(self, x):
+        # x: (seq_len, batch_size, d_model)
+        x = x + self.pe[:x.size(0), :]
+        return x.clone()
+
+
+class InternS1ProTimeSeriesMultiChannelAdaptiveSubsampling(nn.Module):
+    def __init__(self, hidden_dim=128, nhead=8, num_encoder_layers=1):
+        super().__init__()
+        self.conv = nn.Conv1d(in_channels=1, out_channels=hidden_dim, kernel_size=5, stride=1, padding=2)
+        encoder_layers = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=nhead)
+        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_encoder_layers)
+        self.pos_encoder = InternS1ProTimeSeriesFixPositionalEncoding(d_model=hidden_dim)
+        self.subsampling = InternS1ProTimeSeriesConcatSubsampling(128, 2)
+
+    def forward(self, inputs, input_lens, sr):
+        features, feature_lens = self.forward_patch(inputs, input_lens, sr)
+        outputs = features
+        output_lens = feature_lens
+        return outputs, output_lens
+
+    def forward_patch(self, inputs, input_lens, sr):
+        sr = sr.float()
+        strides = torch.floor(160 / ((1 + torch.exp(-sr / 100)) ** 6))
+        patch_sizes = strides * 2
+        patched_outputs = []
+        output_lens = []
+
+        for i in range(len(inputs)):
+            seq = inputs[i]  # [seq_len, num_channel]
+            ps = patch_sizes[i].item()
+            st = strides[i].item()
+            le = input_lens[i]
+
+            output_len = torch.ceil((le - ps) / st) + 1
+            pad_len = ((output_len - 1) * st + ps - le).long().item()
+            if seq.ndim == 1:
+                seq = seq.unsqueeze(-1)
+            seq = nn.functional.pad(seq, (0, 0, 0, pad_len), "constant", 0)
+            assert output_len > 0, (seq.shape, ps, st, le, output_len)
+            output_lens.append(output_len)
+            indices = (torch.arange(0, output_len * st, st).unsqueeze(1) + torch.arange(ps)).long()
+            patched = seq[indices]
+
+            output = self.forward_encoder(patched)  # [num_patch, D]
+            patched_outputs.append(output)
+
+        outputs = nn.utils.rnn.pad_sequence(patched_outputs, batch_first=True)
+        output_lens = torch.tensor(output_lens).squeeze().to(outputs.device).long()
+        if output_lens.ndim == 0:
+            output_lens = output_lens.unsqueeze(0)
+
+        outputs, output_lens = self.subsampling(outputs.clone(), output_lens.clone())
+        return outputs, output_lens
+
+    def forward_encoder(self, x):
+        num_patch, patch_len, C = x.shape
+        # conv1: treat each channel as an independent sample
+        x = x.reshape(num_patch * C, 1, patch_len)
+        x = nn.functional.relu(self.conv(x))  # [B*C, D1, L]
+        x = x.permute(2, 0, 1)  # [L, B*C, D1]
+
+        x = self.pos_encoder(x)  # [L, B*C, D1]
+        x = self.transformer_encoder(x.to(torch.bfloat16))
+        x = x.mean(0)
+
+        x = x.reshape(num_patch, C, -1)
+
+        return x.mean(1)
+
+
+class InternS1ProTimeSeriesProjector(nn.Module):
+    def __init__(self, config: InternS1ProTimeSeriesConfig):
+        super().__init__()
+        self.layer_norm = nn.LayerNorm(config.ts_hidden_dim)
+        self.linear_1 = nn.Linear(config.ts_hidden_dim, config.out_hidden_size)
+        self.act = ACT2FN[config.activation_function]
+        self.linear_2 = nn.Linear(config.out_hidden_size, config.out_hidden_size)
+
+    def forward(self, ts_features):
+        hidden_states = self.layer_norm(ts_features)
+        hidden_states = self.linear_1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+
+
+class InternS1ProTimeSeriesModel(InternS1ProPreTrainedModel):
+    main_input_name = 'time_series_signals'
+    _supports_flash_attn_2 = False
+    config_class = InternS1ProTimeSeriesConfig
+    _no_split_modules = ['WhisperEncoderLayer']
+
+    def __init__(self, config: InternS1ProTimeSeriesConfig):
+        super().__init__(config)
+        self.config = config
+        self.encoder_embed = InternS1ProTimeSeriesMultiChannelAdaptiveSubsampling()
+        self.encoder = InternS1ProTimeSeriesEncoder(config)
+        self.projector = InternS1ProTimeSeriesProjector(config)
+
+    def get_input_embeddings(self):
+        return self.encoder_embed
+
+    def make_pad_mask(self, lengths: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+          lengths:
+            A 1-D tensor containing sentence lengths.
+        Returns:
+          Return a 2-D bool tensor, where masked positions
+          are filled with `True` and non-masked positions are
+          filled with `False`.
+
+        >>> lengths = torch.tensor([1, 3, 2, 5])
+        >>> make_pad_mask(lengths)
+        tensor([[False, True, True, True, True],
+                [False, False, False, True, True],
+                [False, False, True, True, True],
+                [False, False, False, False, False]])
+        """
+        assert lengths.ndim == 1, lengths.ndim
+        max_len = lengths.max()
+        n = lengths.size(0)
+        seq_range = torch.arange(0, max_len, device=lengths.device)
+        expanded_lengths = seq_range.unsqueeze(0).expand(n, max_len)
+        return expanded_lengths >= lengths.unsqueeze(-1)
+
+    def forward(
+        self,
+        time_series_signals: Optional[torch.FloatTensor] = None,
+        ts_lens: Optional[torch.Tensor] = None,
+        sr: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        time_series_embeds: Optional[torch.FloatTensor] = None,
+    ):
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if time_series_signals is None and time_series_embeds is None:
+            raise ValueError('You have to specify time_series_signals or time_series_embeds')
+
+        if time_series_embeds is not None and len(time_series_embeds.shape) == 3 and time_series_embeds.shape[-1] == self.config.ts_adapt_in_dim:
+            time_series_embeds = time_series_embeds
+        else:
+            if (isinstance(time_series_signals, list) and len(time_series_signals[0].shape) == 2) \
+                    or (isinstance(time_series_signals, torch.Tensor) and len(time_series_signals.shape) == 3):
+                time_series_embeds, ts_lens = self.encoder_embed(time_series_signals, ts_lens, sr)
+            else:
+                raise ValueError(f'wrong time_series_signals size: {time_series_signals[0].shape}')
+
+        # [B, 64000, 1] -> [B, 200, 256] -> [B, 100, 1024]
+        encoder_outputs = self.encoder(
+            input_features=time_series_embeds,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # ts_lens after encoder
+        ts_lens = (ts_lens + 1) // 2
+        assert torch.all(ts_lens > 0), f"The length of time_series_embeds is too small. ts_lens: {ts_lens}"
+
+        src_key_padding_mask = self.make_pad_mask(ts_lens)
+        last_hidden_state = encoder_outputs.last_hidden_state
+
+        ts_pad_mask = src_key_padding_mask
+        ts_embeds = self.projector(last_hidden_state)
+
+        return ts_embeds, ts_pad_mask
+
+
 @dataclass
 @auto_docstring(
     custom_intro="""
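`forward_patch` ties the patch stride to the sampling rate through `floor(160 / (1 + exp(-sr / 100)) ** 6)`, so slowly sampled signals get fine patches (stride 2) while fast ones saturate at stride 160, with patch size fixed at twice the stride. A standalone check of that schedule:

```python
import torch

sr = torch.tensor([1.0, 100.0, 500.0, 4000.0])  # sampling rates in Hz
strides = torch.floor(160 / ((1 + torch.exp(-sr / 100)) ** 6))
patch_sizes = strides * 2
print(strides)      # tensor([  2.,  24., 153., 160.])
print(patch_sizes)  # tensor([  4.,  48., 306., 320.])
```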
@@ -1118,12 +1556,13 @@ class InternS1ProModel(InternS1ProPreTrainedModel):
     # Reference: fix gemma3 grad acc #37208
     accepts_loss_kwargs = False
     config: InternS1ProConfig
-    _no_split_modules = ["InternS1ProMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock"]
+    _no_split_modules = ["InternS1ProMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock", "WhisperEncoderLayer"]
 
     def __init__(self, config):
         super().__init__(config)
         self.visual = InternS1ProVisionModel._from_config(config.vision_config)
         self.language_model = InternS1ProTextModel._from_config(config.text_config)
+        self.time_series = InternS1ProTimeSeriesModel._from_config(config.ts_config)
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -1170,6 +1609,15 @@ class InternS1ProModel(InternS1ProPreTrainedModel):
         split_sizes = (image_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
         image_embeds = torch.split(image_embeds, split_sizes)
         return image_embeds
+
+    def get_ts_feature(self, ts_values, ts_lens, sr):
+        ts_embeds, ts_pad_mask = self.time_series(
+            time_series_signals=ts_values,
+            ts_lens=ts_lens,
+            sr=sr,
+            output_hidden_states=False,
+            return_dict=True)
+        return ts_embeds, ts_pad_mask
 
     def get_placeholder_mask(
         self,
@@ -1225,6 +1673,9 @@ class InternS1ProModel(InternS1ProPreTrainedModel):
         image_grid_thw: Optional[torch.LongTensor] = None,
         video_grid_thw: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        ts_values: Union[torch.FloatTensor, list[torch.FloatTensor]] = None,
+        ts_lens: Union[torch.Tensor, list[torch.Tensor]] = None,
+        ts_sr: Union[torch.FloatTensor, list[torch.FloatTensor]] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, Qwen3VLMoeModelOutputWithPast]:
         r"""
@@ -1232,6 +1683,12 @@ class InternS1ProModel(InternS1ProPreTrainedModel):
             The temporal, height and width of feature shape of each image in LLM.
         video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
             The temporal, height and width of feature shape of each video in LLM.
+        ts_values (`torch.FloatTensor` of shape `(batch_size, seq_len, num_channels)`, *optional*):
+            The tensors corresponding to the input time series signals.
+        ts_lens (`torch.Tensor` of shape `(batch_size,)`, *optional*):
+            The valid lengths of each time series signal in the batch.
+        ts_sr (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
+            The sampling rates of each time series signal in the batch.
         """
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
@@ -1258,6 +1715,27 @@ class InternS1ProModel(InternS1ProPreTrainedModel):
             )
             inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
 
+        if pixel_values is None and pixel_values_videos is None and ts_values is not None:
+            ts_features, ts_pad_mask = self.get_ts_feature(ts_values, ts_lens, ts_sr)  # [B, T, C], [B, T]
+            ts_features = ts_features[~ts_pad_mask].to(inputs_embeds.device, inputs_embeds.dtype)  # [num_valid_ts_tokens, C]
+            B, N, C = inputs_embeds.shape
+            input_ids = input_ids.reshape(B * N)
+            inputs_embeds = inputs_embeds.reshape(B * N, C)
+            # replace ts_token positions in inputs_embeds
+            ts_placeholder = (input_ids == self.config.ts_token_id)
+            n_ts_placeholders = ts_placeholder.sum().item()
+            n_ts_tokens = ts_features.size(0)
+            assert n_ts_placeholders == n_ts_tokens, f"[ERROR]: Mismatch: <TS_CONTEXT> tokens={n_ts_placeholders}, ts_embeds_valid={n_ts_tokens}"
+
+            try:
+                inputs_embeds[ts_placeholder] = inputs_embeds[ts_placeholder] * 0.0 + ts_features
+            except Exception as e:
+                print(f'warning: {e}, inputs_embeds[selected].shape={inputs_embeds[ts_placeholder].shape}, ts_embeds_valid.shape={ts_features.shape}')
+                inputs_embeds[ts_placeholder] = inputs_embeds[ts_placeholder] * 0.0 + ts_features[:n_ts_placeholders]
+
+            inputs_embeds = inputs_embeds.reshape(B, N, C)
+
         if position_ids is None:
             batch_size, seq_length = inputs_embeds.shape[:2]
             if cache_position is not None:
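The scatter above hinges on a one-to-one match between `<TS_CONTEXT>` positions in the flattened `input_ids` and valid (non-padded) rows of the projected features, which is exactly what the assertion guards. A toy illustration of the same masked assignment (the hidden size and the non-placeholder ids are made up):

```python
import torch

TS_TOKEN_ID = 151685  # ts_token_id from config.json
input_ids = torch.tensor([11, TS_TOKEN_ID, TS_TOKEN_ID, 42])  # flattened B*N
inputs_embeds = torch.zeros(4, 8)                             # toy hidden size 8
ts_features = torch.ones(2, 8)                                # one row per placeholder

ts_placeholder = input_ids == TS_TOKEN_ID
assert ts_placeholder.sum().item() == ts_features.size(0)
inputs_embeds[ts_placeholder] = inputs_embeds[ts_placeholder] * 0.0 + ts_features
print(inputs_embeds.sum(dim=1))  # tensor([0., 8., 8., 0.])
```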
@@ -1396,6 +1874,8 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, GenerationMixin):
     def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
         return self.model.get_image_features(pixel_values, image_grid_thw)
 
+    def get_ts_feature(self, ts_values, ts_lens, sr):
+        return self.model.get_ts_feature(ts_values, ts_lens, sr)
+
     # Make modules available through conditional class for BC
     @property
     def language_model(self):
@@ -1404,6 +1884,9 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, GenerationMixin):
     @property
     def visual(self):
         return self.model.visual
+
+    @property
+    def time_series(self):
+        return self.model.time_series
 
     @check_model_inputs
     def forward(
@@ -1418,6 +1901,9 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, GenerationMixin):
         pixel_values_videos: Optional[torch.FloatTensor] = None,
         image_grid_thw: Optional[torch.LongTensor] = None,
         video_grid_thw: Optional[torch.LongTensor] = None,
+        ts_values: Optional[Union[torch.FloatTensor, list[torch.FloatTensor]]] = None,
+        ts_lens: Optional[Union[torch.Tensor, list[torch.Tensor]]] = None,
+        ts_sr: Optional[Union[torch.FloatTensor, list[torch.FloatTensor]]] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs: Unpack[TransformersKwargs],
@@ -1484,6 +1970,9 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, GenerationMixin):
             past_key_values=past_key_values,
             inputs_embeds=inputs_embeds,
             cache_position=cache_position,
+            ts_values=ts_values,
+            ts_lens=ts_lens,
+            ts_sr=ts_sr,
             **kwargs,
         )
 
@@ -1530,6 +2019,9 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, GenerationMixin):
         pixel_values_videos=None,
         image_grid_thw=None,
         video_grid_thw=None,
+        ts_values=None,
+        ts_lens=None,
+        ts_sr=None,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -1546,6 +2038,9 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, GenerationMixin):
             image_grid_thw=image_grid_thw,
             video_grid_thw=video_grid_thw,
             use_cache=use_cache,
+            ts_values=ts_values,
+            ts_lens=ts_lens,
+            ts_sr=ts_sr,
             **kwargs,
         )
 
@@ -1554,6 +2049,9 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, GenerationMixin):
         if cache_position[0] != 0:
             model_inputs["pixel_values"] = None
            model_inputs["pixel_values_videos"] = None
+            model_inputs["ts_values"] = None
+            model_inputs["ts_lens"] = None
+            model_inputs["ts_sr"] = None
 
         return model_inputs
 
@@ -1697,6 +2195,7 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, GenerationMixin):
 
 __all__ = [
     "InternS1ProVisionModel",
+    "InternS1ProTimeSeriesModel",
     "InternS1ProForConditionalGeneration",
     "InternS1ProModel",
     "InternS1ProPreTrainedModel",
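The time-series tower can be smoke-tested without the language model. A sketch under stated assumptions: the repo modules are importable, the config kwargs mirror the `ts_config` block from config.json, and weight-init and device placement details are glossed over. Per the shape comment in `InternS1ProTimeSeriesModel.forward`, a 64000-sample mono signal comes out as roughly `[1, T', 4096]` with a matching pad mask:

```python
import torch
from configuration_interns1_pro import InternS1ProTimeSeriesConfig
from modeling_interns1_pro import InternS1ProTimeSeriesModel

cfg = InternS1ProTimeSeriesConfig(
    d_model=768, encoder_layers=17, encoder_attention_heads=8, encoder_ffn_dim=3072,
    num_mel_bins=80, max_source_positions=1500, out_hidden_size=4096,
)
# bfloat16 throughout, since forward_encoder casts activations to bfloat16 internally
ts_model = InternS1ProTimeSeriesModel(cfg).to(torch.bfloat16).eval()

signals = torch.randn(1, 64000, 1, dtype=torch.bfloat16)  # [B, T, C]
lens = torch.tensor([64000])
sr = torch.tensor([100.0])
with torch.no_grad():
    ts_embeds, ts_pad_mask = ts_model(time_series_signals=signals, ts_lens=lens, sr=sr)
print(ts_embeds.shape, ts_pad_mask.shape)  # roughly [1, T', 4096] and [1, T']
```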
processing_interns1_pro.py
CHANGED
| 18 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 19 |
# See the License for the specific language governing permissions and
|
| 20 |
# limitations under the License.
|
| 21 |
+
from typing import Union,Optional
|
| 22 |
|
| 23 |
import numpy as np
|
| 24 |
|
|
|
|
| 28 |
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
| 29 |
from transformers.utils import logging
|
| 30 |
from transformers.video_utils import VideoInput
|
| 31 |
+
import os
|
| 32 |
|
| 33 |
|
| 34 |
logger = logging.get_logger(__name__)
|
|
|
|
| 42 |
"return_mm_token_type_ids": False,
|
| 43 |
},
|
| 44 |
"videos_kwargs": {"return_metadata": True},
|
| 45 |
+
"time_series_kwargs": {},
|
| 46 |
}
|
| 47 |
|
| 48 |
|
|
|
|
| 70 |
def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
|
| 71 |
self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
|
| 72 |
self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
|
| 73 |
+
self.ts_token = "<TS_CONTEXT>" if not hasattr(tokenizer, "ts_token") else tokenizer.ts_token
|
| 74 |
self.image_token_id = (
|
| 75 |
tokenizer.image_token_id
|
| 76 |
if getattr(tokenizer, "image_token_id", None)
|
|
|
|
| 81 |
if getattr(tokenizer, "video_token_id", None)
|
| 82 |
else tokenizer.convert_tokens_to_ids(self.video_token)
|
| 83 |
)
|
| 84 |
+
self.ts_token_id = (
|
| 85 |
+
tokenizer.ts_token_id
|
| 86 |
+
if getattr(tokenizer, "ts_token_id", None)
|
| 87 |
+
else tokenizer.convert_tokens_to_ids(self.ts_token)
|
| 88 |
+
)
|
| 89 |
super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
|
| 90 |
self.vision_start_token = (
|
| 91 |
"<|vision_start|>" if not hasattr(tokenizer, "vision_start_token") else tokenizer.vision_start_token
|
|
|
|
| 103 |
if getattr(tokenizer, "vision_end_token_id", None)
|
| 104 |
else tokenizer.convert_tokens_to_ids(self.vision_end_token)
|
| 105 |
)
|
| 106 |
+
self.ts_start_token = (
|
| 107 |
+
"<|ts|>" if not hasattr(tokenizer, "ts_start_token") else tokenizer.ts_start_token
|
| 108 |
+
)
|
| 109 |
+
self.ts_end_token = (
|
| 110 |
+
"<|/ts|>" if not hasattr(tokenizer, "ts_end_token") else tokenizer.ts_end_token
|
| 111 |
+
)
|
| 112 |
+
self.ts_start_token_id = (
|
| 113 |
+
tokenizer.ts_start_token_id
|
| 114 |
+
if getattr(tokenizer, "ts_start_token_id", None)
|
| 115 |
+
else tokenizer.convert_tokens_to_ids(self.ts_start_token)
|
| 116 |
+
)
|
| 117 |
+
self.ts_end_token_id = (
|
| 118 |
+
tokenizer.ts_end_token_id
|
| 119 |
+
if getattr(tokenizer, "ts_end_token_id", None)
|
| 120 |
+
else tokenizer.convert_tokens_to_ids(self.ts_end_token)
|
| 121 |
+
)
|
| 122 |
+
|
+    def time_series_preprocessor(self, conversation):
+        if isinstance(conversation, (list, tuple)) and (
+            isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content")
+        ):
+            conversations = conversation
+        else:
+            conversations = [conversation]
+
+        batch_time_series = []
+        batch_time_series_metadata = []
+        for conversation in conversations:
+            for message in conversation:
+                if message["role"] != "user":
+                    continue
+                # use the same filter for both lists so paths and sampling rates stay aligned
+                time_series_fnames = [
+                    content["data"]
+                    for content in message["content"]
+                    if content.get("type") == "time_series" and "data" in content
+                ]
+                time_series_rates = [
+                    content.get("sampling_rate", None)
+                    for content in message["content"]
+                    if content.get("type") == "time_series" and "data" in content
+                ]
+                for path, rate in zip(time_series_fnames, time_series_rates):
+                    batch_time_series.append(path)
+                    batch_time_series_metadata.append(rate)
+
+        return {
+            "time_series_paths": batch_time_series if batch_time_series else None,
+            "time_series_sampling_rates": batch_time_series_metadata if batch_time_series_metadata else None,
+        }
+
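For reference, a minimal sketch of what time_series_preprocessor pulls out of a single-turn conversation (prompt text illustrative; processor loaded as in test_inference_ts.py below):

    messages = [{
        "role": "user",
        "content": [
            {"type": "time_series", "data": "./0092638_seism.npy", "sampling_rate": 100},
            {"type": "text", "text": "Describe this signal."},
        ],
    }]
    processor.time_series_preprocessor(messages)
    # -> {"time_series_paths": ["./0092638_seism.npy"], "time_series_sampling_rates": [100]}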
+    def time_series_processor(
+        self,
+        ts_paths: list[str],
+        sampling_rates: list[float],
+        do_normalize=True,
+        do_truncate=True,
+    ) -> BatchFeature:
+        assert len(ts_paths) == len(sampling_rates), "ts_paths and sampling_rates must have the same length"
+
+        ts_values = []
+        ts_sr = []
+        ts_lens = []
+
+        for idx, ts_path in enumerate(ts_paths):
+            sr = sampling_rates[idx]
+            ext = os.path.splitext(ts_path)[-1].lower()
+            if ext in [".wav", ".mp3", ".flac"]:
+                try:
+                    import soundfile as sf
+                except ImportError:
+                    raise ImportError("Please install soundfile to process audio files.")
+                ts_input, sr = sf.read(ts_path)  # ts_input: np.ndarray, shape [T] or [T, C]
+            elif ext == ".csv":
+                try:
+                    import pandas as pd  # lazy import so pandas stays an optional dependency
+                except ImportError:
+                    raise ImportError("Please install pandas to process csv files.")
+                df = pd.read_csv(ts_path, header=None)
+                ts_input = df.values  # [T, C]
+            elif ext == ".npy":
+                ts_input = np.load(ts_path)  # [T, C]
+            else:
+                raise ValueError(f"Unsupported file format: {ext}")
+
+            if not isinstance(ts_input, np.ndarray):
+                ts_input = np.array(ts_input, dtype=np.float32)
+
+            if do_normalize:
+                # per-channel z-score normalization
+                mean = ts_input.mean(axis=0, keepdims=True)
+                std = ts_input.std(axis=0, keepdims=True)
+                ts_input = (ts_input - mean) / (std + 1e-8)
+
+            if do_truncate and len(ts_input) > 240000:
+                ts_input = ts_input[:240000]  # truncate to 240k samples to avoid OOM
+
+            if ts_input.ndim == 1:
+                ts_input = ts_input[:, None]  # [T] -> [T, C]
+
+            ts_len = ts_input.shape[0]
+
+            if sr is None or sr == 0:  # if no sampling rate is given, assume the signal spans 4 seconds
+                sr = ts_len / 4
+
+            ts_values.append(ts_input)
+            ts_sr.append(sr)
+            ts_lens.append(ts_len)
+
+        ts_lens = np.array(ts_lens)
+        ts_sr = np.array(ts_sr)
+        num_ts_tokens = self._get_num_ts_tokens(sampling_rates=ts_sr, ts_lens=ts_lens)
+        return BatchFeature(
+            data={
+                "ts_values": ts_values,
+                "ts_sr": ts_sr,
+                "ts_lens": ts_lens,
+                "num_ts_tokens": num_ts_tokens,
+            }
+        )
+
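A minimal usage sketch of time_series_processor (demo.npy and its length are assumptions for illustration; processor loaded as in test_inference_ts.py below):

    import numpy as np

    np.save("demo.npy", np.random.randn(6000).astype(np.float32))  # hypothetical 60 s signal at 100 Hz
    feats = processor.time_series_processor(ts_paths=["demo.npy"], sampling_rates=[100])
    print(feats["ts_values"][0].shape)  # (6000, 1): z-score normalized, channel axis added
    print(feats["ts_lens"], feats["ts_sr"], feats["num_ts_tokens"])  # [6000] [100.] [62]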
+    def _get_num_ts_tokens(self, sampling_rates, ts_lens):
+        # patch stride grows with the sampling rate and saturates at 160 samples
+        strides = np.floor(160 / ((1 + np.exp(-sampling_rates / 100)) ** 6))
+        patch_sizes = strides * 2
+        embed_lengths = (np.ceil((ts_lens - patch_sizes) / strides) + 1).astype(np.int64)
+        # the embedded sequence is then downsampled twice by a factor of 2
+        num_ts_tokens = [(embed_length // 2 + 1) // 2 for embed_length in embed_lengths]
+        return num_ts_tokens

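Worked example of the token budget: at 100 Hz the stride is floor(160 / (1 + e^(-1))^6) = floor(160 / 6.55) = 24 and the patch size is 48, so a 6000-sample signal yields embed_length = ceil((6000 - 48) / 24) + 1 = 249 and num_ts_tokens = (249 // 2 + 1) // 2 = 62 <TS_CONTEXT> tokens.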
@@ ... @@
     def __call__(
         self,
         images: ImageInput = None,
         text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
         videos: VideoInput = None,
+        time_series_paths: Optional[list[str]] = None,
+        time_series_sampling_rates: Optional[list[float]] = None,
         **kwargs: Unpack[InternS1ProProcessorKwargs],
     ) -> BatchFeature:
         """
@@ ... @@
             videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                 The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
                 tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
+            time_series_paths (`list[str]`, *optional*):
+                Paths to the time-series files (`.npy`, `.csv`, `.wav`, `.mp3`, `.flac`) to be prepared.
+            time_series_sampling_rates (`list[float]`, *optional*):
+                Sampling rate of each time series, matched one-to-one with `time_series_paths`.
             return_tensors (`str` or [`~utils.TensorType`], *optional*):
                 If set, will return tensors of a particular framework. Acceptable values are:
                     - `'pt'`: Return PyTorch `torch.Tensor` objects.
@@ ... @@

             text[i] = text[i].replace("<|placeholder|>", self.video_token)

+        time_series_inputs = {}
+        if images is None and videos is None and time_series_paths is not None:
+            assert time_series_sampling_rates is not None, "If time_series_paths is provided, time_series_sampling_rates must also be provided."
+            assert len(time_series_paths) == len(time_series_sampling_rates), "The number of time series signals must match the number of sampling rates."
+            time_series_inputs = self.time_series_processor(ts_paths=time_series_paths, sampling_rates=time_series_sampling_rates)
+            num_ts_tokens = time_series_inputs.pop("num_ts_tokens")
+            assert len(num_ts_tokens) == len(text), "The number of time series signals must match the number of text prompts."
+            for i in range(len(text)):
+                if f"{self.ts_start_token}{self.ts_token}{self.ts_end_token}" in text[i]:
+                    ts_placeholder = self.ts_start_token + self.ts_token * num_ts_tokens[i] + self.ts_end_token
+                    text[i] = text[i].replace(
+                        f"{self.ts_start_token}{self.ts_token}{self.ts_end_token}", ts_placeholder, 1
+                    )
+                elif self.ts_token in text[i]:
+                    text[i] = text[i].replace(self.ts_token, self.ts_token * num_ts_tokens[i])
+
         return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
         return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
         text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
@@ -227,7 +372,7 @@ class InternS1ProProcessor(ProcessorMixin):
         mm_token_type_ids[array_ids == self.image_token_id] = 1
         text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()

-        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
+        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs, **time_series_inputs}, tensor_type=return_tensors)

     def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
         """
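To make the expansion above concrete: __call__ looks for a <|ts|><TS_CONTEXT><|/ts|> span and widens it to one <TS_CONTEXT> per time-series embedding before tokenization; a bare <TS_CONTEXT> without delimiters is widened in place. A minimal sketch (token count illustrative; in practice it comes from _get_num_ts_tokens):

    text = ["<|ts|><TS_CONTEXT><|/ts|>\nIs there an event in this signal?"]
    num_ts_tokens = [62]
    span = "<|ts|>" + "<TS_CONTEXT>" * num_ts_tokens[0] + "<|/ts|>"
    text[0] = text[0].replace("<|ts|><TS_CONTEXT><|/ts|>", span, 1)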
test_inference_ts.py
ADDED
@@ -0,0 +1,78 @@
+from pathlib import Path
+import torch
+from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor
+
+
+model_path = Path(__file__).parent.resolve()
+print(f"Loading model from: {model_path}")
+
+# Load the model config
+config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+print(f"Model config: {config.model_type}")
+print(f"Architecture: {config.architectures}")
+
+# Load the processor (tokenizer + image processor + ts processor)
+print("\nLoading processor...")
+processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+
+# Load the model (bfloat16 precision, automatic device mapping)
+print("\nLoading model...")
+model = AutoModelForCausalLM.from_pretrained(
+    model_path,
+    dtype=torch.bfloat16,
+    device_map="auto",
+    # attn_implementation="flash_attention_2",  # the time-series branch does not support flash_attn yet; enabling this makes loading fail
+    trust_remote_code=True
+)
+
+print("✓ Model loaded successfully!")
+print(f"Model type: {type(model).__name__}")
+print(f"Model device: {model.device}")
+
+# ============================================================================
+# Test 3: time-series conversation
+# ============================================================================
+print("\n" + "=" * 80)
+print("Test 3: time-series conversation")
+print("=" * 80)
+
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "time_series", "data": "./0092638_seism.npy", "sampling_rate": 100},
+            {"type": "text", "text": "Please determine whether an Earthquake event has occurred in the provided time-series data. If so, please specify the starting time point indices of the P-wave and S-wave in the event."},
+        ],
+    }
+]
+
+time_series_inputs = processor.time_series_preprocessor(messages)
+multimodal_inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt", enable_thinking=False, **time_series_inputs).to(model.device, dtype=torch.bfloat16)
+
+print("\nGenerating time-series response...")
+with torch.inference_mode():
+    multimodal_generated_ids = model.generate(
+        **multimodal_inputs,
+        max_new_tokens=200,
+        do_sample=False,
+        temperature=1.0,
+    )
+
+# Strip the prompt tokens from the generated ids
+multimodal_generated_ids_trimmed = [
+    out_ids[len(in_ids):] for in_ids, out_ids in zip(multimodal_inputs.input_ids, multimodal_generated_ids)
+]
+
+# Decode to text
+multimodal_output = processor.batch_decode(
+    multimodal_generated_ids_trimmed,
+    skip_special_tokens=True,
+    clean_up_tokenization_spaces=False
+)
+
+print("\n" + "-" * 80)
+print("Time-series output:")
+print("-" * 80)
+print(multimodal_output[0])
+print("-" * 80)
+print("\n✅ Time-series test finished!")