QipengGuo and yehaochen committed
Commit 619e440 · 1 Parent(s): e00807a

[Feature] Update time series (#13)


- [Feature] update time series model (d9bead416bc2e29ee7e2d4f964840a5699007664)
- [Weight] Update time series safetensors (262600bfe6a5013e9407e3f6b7be303a3dbdce42)
- [test] update time series test scripts (bbe1e5b60f58e4587d421dca5d74cea85e17e891)
- [Fix] Remove hard requirements of pandas (f2a60cf1369f53de255813688aad3557d0957391)


Co-authored-by: yehaochen <yehaochen@users.noreply.huggingface.co>

0092638_seism.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c2b94653c6964b630038897a27cb6d276ff866d9ecd1f6419358b9407f0df62e
+ size 72128
chat_template.jinja CHANGED
@@ -17,6 +17,8 @@
   {{- 'Video ' + video_count.value|string + ': <|vision_start|><|video_pad|><|vision_end|>'-}}
   {%- elif 'text' in item %}
   {{- item.text }}
+  {%- elif 'time_series' in item or item.type == 'time_series' %}
+  {{- '<|ts|><TS_CONTEXT><|/ts|>'-}}
   {%- endif %}
   {%- endfor %}
   {%- endif %}
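Note for reviewers: with this change, a `time_series` content item in a user turn is rendered to the fixed `<|ts|><TS_CONTEXT><|/ts|>` span, which the processor added below expands into one `<TS_CONTEXT>` token per downsampled patch. A minimal sketch (the message structure mirrors test_inference_ts.py added in this commit; `processor` is assumed to be the repo's AutoProcessor):

# Sketch: a "time_series" item in a user turn becomes <|ts|><TS_CONTEXT><|/ts|> in the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "time_series", "data": "./0092638_seism.npy", "sampling_rate": 100},
            {"type": "text", "text": "Describe this signal."},
        ],
    }
]
# prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
# assert "<|ts|><TS_CONTEXT><|/ts|>" in prompt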
config.json CHANGED
@@ -58,6 +58,37 @@
   },
   "vision_end_token_id": 151653,
   "vision_start_token_id": 151652,
+  "ts_config": {
+    "auto_map": {
+      "AutoConfig": "configuration_interns1_pro.InternS1ProTimeSeriesConfig",
+      "AutoModel": "modeling_interns1_pro.InternS1ProTimeSeriesModel"
+    },
+    "activation_dropout": 0.0,
+    "activation_function": "gelu",
+    "architectures": [
+      "InternS1TimeSeriesModel"
+    ],
+    "attention_dropout": 0.0,
+    "d_model": 768,
+    "dropout": 0.0,
+    "dtype": "bfloat16",
+    "encoder_attention_heads": 8,
+    "encoder_ffn_dim": 3072,
+    "encoder_layerdrop": 0.0,
+    "encoder_layers": 17,
+    "model_type": "interns1_pro_time_series",
+    "max_source_positions": 1500,
+    "num_mel_bins": 80,
+    "out_hidden_size": 4096,
+    "scale_embedding": false,
+    "ts_adapt_in_dim": 256,
+    "ts_adapt_out_dim": 1024,
+    "use_cache": true,
+    "attn_implementation": "eager"
+  },
+  "ts_end_id": 151684,
+  "ts_start_id": 151683,
+  "ts_token_id": 151685,
   "auto_map": {
     "AutoConfig": "configuration_interns1_pro.InternS1ProConfig",
     "AutoModel": "modeling_interns1_pro.InternS1ProModel",
@@ -141,6 +172,7 @@
   "model.visual.blocks.17.mlp.linear_fc1",
   "model.visual.blocks.4.norm2",
   "model.visual.blocks.17.attn.qkv",
+  "model.time_series",
   "model.language_model.layers.83.self_attn.k_norm",
   "model.language_model.layers.47.post_attention_layernorm",
   "model.language_model.layers.59.input_layernorm",
configuration_interns1_pro.py CHANGED
@@ -15,6 +15,7 @@
 
 from transformers.configuration_utils import PretrainedConfig
 from transformers.modeling_rope_utils import rope_config_validation
+ from transformers import WhisperConfig
 
 
 class InternS1ProTextConfig(PretrainedConfig):
@@ -138,20 +139,61 @@ class InternS1ProVisionConfig(PretrainedConfig):
         self.num_position_embeddings = num_position_embeddings
         self.initializer_range = initializer_range
 
+ class InternS1ProTimeSeriesConfig(WhisperConfig):
+
+     model_type = "interns1_pro_time_series"
+     base_config_key = "ts_config"
+
+     def __init__(
+         self,
+         ts_adapt_in_dim: int = 256,
+         ts_adapt_out_dim: int = 1024,
+         ts_hidden_dim: int = 1024,
+         ts_cnn_channels: list[int] = [1, 32, 64, 128, 128],
+         ts_cnn_kernel_sizes: list[int] = [3, 5, 5, 5],
+         ts_cnn_strides: list[int] = [2, 4, 4, 5],
+         ts_cnn_paddings: list[int] = [1, 2, 2, 2],
+         ts_concat_subsampling_in_channels: int = 128,
+         ts_concat_subsampling_concat_size: int = 2,
+         use_flash_attn: bool = False,
+         **kwargs
+     ):
+         super().__init__(**kwargs)
+
+         self.ts_cnn_channels = ts_cnn_channels
+         self.ts_cnn_kernel_sizes = ts_cnn_kernel_sizes
+         self.ts_cnn_strides = ts_cnn_strides
+         self.ts_cnn_paddings = ts_cnn_paddings
+         self.ts_concat_subsampling_in_channels = ts_concat_subsampling_in_channels
+         self.ts_concat_subsampling_concat_size = ts_concat_subsampling_concat_size
+
+         self.ts_adapt_in_dim = ts_adapt_in_dim
+         self.ts_adapt_out_dim = ts_adapt_out_dim
+
+         self.ts_hidden_dim = ts_hidden_dim
+         self.use_flash_attn = use_flash_attn
+
+         assert self.ts_adapt_out_dim == self.ts_hidden_dim, "ts_adapt_out_dim should be equal to ts_hidden_dim"
+         assert self.ts_concat_subsampling_in_channels == self.ts_cnn_channels[-1], "ts_concat_subsampling_in_channels should be equal to the out_channel of the last cnn layer"
+
 
 class InternS1ProConfig(PretrainedConfig):
     model_type = "interns1_pro"
-     sub_configs = {"vision_config": InternS1ProVisionConfig, "text_config": InternS1ProTextConfig}
+     sub_configs = {"vision_config": InternS1ProVisionConfig, "text_config": InternS1ProTextConfig, "ts_config": InternS1ProTimeSeriesConfig}
     keys_to_ignore_at_inference = ["past_key_values"]
 
     def __init__(
         self,
         text_config=None,
         vision_config=None,
+         ts_config=None,
         image_token_id=151655,
         video_token_id=151656,
         vision_start_token_id=151652,
         vision_end_token_id=151653,
+         ts_token_id=151685,
+         ts_start_id=151683,
+         ts_end_id=151684,
         tie_word_embeddings=False,
         **kwargs,
     ):
@@ -165,11 +207,19 @@ class InternS1ProConfig(PretrainedConfig):
         elif text_config is None:
             self.text_config = self.sub_configs["text_config"]()
 
+         if isinstance(ts_config, dict):
+             self.ts_config = self.sub_configs["ts_config"](**ts_config)
+         elif ts_config is None:
+             self.ts_config = self.sub_configs["ts_config"]()
+
         self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.vision_start_token_id = vision_start_token_id
        self.vision_end_token_id = vision_end_token_id
+         self.ts_token_id = ts_token_id
+         self.ts_start_id = ts_start_id
+         self.ts_end_id = ts_end_id
        super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
 
 
- __all__ = ["InternS1ProConfig", "InternS1ProTextConfig", "InternS1ProVisionConfig"]
+ __all__ = ["InternS1ProConfig", "InternS1ProTextConfig", "InternS1ProVisionConfig", "InternS1ProTimeSeriesConfig"]
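For context, the two asserts in the new config tie the adapter output width to the encoder hidden width and the subsampling input channels to the last CNN channel. A minimal sketch of the failure mode (hypothetical usage, not part of the diff):

from configuration_interns1_pro import InternS1ProTimeSeriesConfig

# Defaults satisfy the constraints: ts_adapt_out_dim == ts_hidden_dim == 1024.
ts_cfg = InternS1ProTimeSeriesConfig()
print(ts_cfg.ts_adapt_out_dim, ts_cfg.ts_hidden_dim)   # 1024 1024
# InternS1ProTimeSeriesConfig(ts_adapt_out_dim=512)    # would raise AssertionError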
model-time_series-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6fab87c45c01a8695f97b5801bee2771ac6e874561ac773983397d958f1e7a00
+ size 291982664
model-time_series-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4150fadfb90bd9561c422b37ecc83fd5a30966f1e555bc9305b9fd5d2c914b0d
+ size 10240128
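The two new shards are regular safetensors files tracked through Git LFS; a small sketch for inspecting them locally (assumes the `safetensors` package and that the files have been pulled from LFS):

from safetensors import safe_open

# Hypothetical: list how many tensors each new time-series shard carries.
for shard in [
    "model-time_series-00001-of-00002.safetensors",
    "model-time_series-00002-of-00002.safetensors",
]:
    with safe_open(shard, framework="pt") as f:
        print(shard, len(f.keys()), "tensors")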
model.safetensors.index.json CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:7de640c8e6f374c36de64b925b2c107896731ef642283e490e69125ec5c4eac1
- size 32204741
+ oid sha256:6aa1acb6e462542ccb55d50c9ba2097df081b6fd69b8ac5aaed1f0b30b14678e
+ size 32236540
modeling_interns1_pro.py CHANGED
@@ -34,8 +34,10 @@ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from transformers.processing_utils import Unpack
 from transformers.utils import TransformersKwargs, auto_docstring, is_torchdynamo_compiling
 from transformers.utils.generic import OutputRecorder, check_model_inputs
- from .configuration_interns1_pro import InternS1ProConfig, InternS1ProTextConfig, InternS1ProVisionConfig
-
+ from .configuration_interns1_pro import InternS1ProConfig, InternS1ProTextConfig, InternS1ProVisionConfig, InternS1ProTimeSeriesConfig
+ from transformers.models.whisper.modeling_whisper import WhisperEncoderLayer
+ from transformers import WhisperPreTrainedModel
+ import math
 
 @use_kernel_forward_from_hub("RMSNorm")
 class Qwen3VLMoeTextRMSNorm(nn.Module):
@@ -439,7 +441,7 @@ class InternS1ProPreTrainedModel(PreTrainedModel):
     config: InternS1ProConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-     _no_split_modules = ["InternS1ProMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock"]
+     _no_split_modules = ["InternS1ProMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock", "WhisperEncoderLayer"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
@@ -1057,6 +1059,442 @@ class InternS1ProTextModel(InternS1ProPreTrainedModel):
         )
 
 
+ class InternS1ProTimeSeriesEncoder(WhisperPreTrainedModel):
+     def __init__(self, config: InternS1ProTimeSeriesConfig):
+         super().__init__(config)
+         self.config = config
+         self.dropout = config.dropout
+         self.layerdrop = config.encoder_layerdrop
+
+         self.embed_dim = config.d_model
+         self.num_mel_bins = config.num_mel_bins
+         self.padding_idx = config.pad_token_id
+         self.max_source_positions = config.max_source_positions
+         self.embed_scale = math.sqrt(self.embed_dim) if config.scale_embedding else 1.0
+
+         self.conv1 = nn.Conv1d(self.num_mel_bins, self.embed_dim, kernel_size=3, padding=1)
+         self.conv2 = nn.Conv1d(self.embed_dim, self.embed_dim, kernel_size=3, stride=2, padding=1)
+         self.embed_positions = nn.Embedding(self.max_source_positions, self.embed_dim)
+
+         self.layers = nn.ModuleList([WhisperEncoderLayer(config) for _ in range(config.encoder_layers)])
+         self.layer_norm = nn.LayerNorm(config.d_model)
+
+         self.gradient_checkpointing = False
+         self.post_init()
+
+         self.mask_type = None
+         self.chunk_length = None
+
+         self.adapt_in = nn.Linear(config.ts_adapt_in_dim, 80)
+         self.adapt_out = nn.Linear(self.embed_dim, config.ts_adapt_out_dim)
+
+     def _freeze_parameters(self):
+         for param in self.parameters():
+             param.requires_grad = False
+         self._requires_grad = False
+
+     def get_input_embeddings(self) -> nn.Module:
+         return self.conv1
+
+     def set_input_embeddings(self, value: nn.Module):
+         self.conv1 = value
+
+     def define_masktype(self, masktype, chunk_length=None):
+         self.mask_type = masktype
+         self.chunk_length = chunk_length
+
+     def _make_causal_mask(self,
+         input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+     ):
+         """
+         Make causal mask used for bi-directional self-attention.
+         """
+         bsz, tgt_len = input_ids_shape
+         mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+         mask_cond = torch.arange(mask.size(-1), device=device)
+         mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+         mask = mask.to(dtype)
+
+         if past_key_values_length > 0:
+             mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+         return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+     # Copied from transformers.models.bart.modeling_bart._expand_mask
+     def _expand_mask(self, mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+         """
+         Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+         """
+         # print(mask.size())
+         bsz, src_len = mask.size()
+         tgt_len = tgt_len if tgt_len is not None else src_len
+
+         expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+         inverted_mask = 1.0 - expanded_mask
+
+         return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+     def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+         # create causal mask
+         # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+         combined_attention_mask = None
+
+         if input_shape[-1] > 1:
+             combined_attention_mask = self._make_causal_mask(
+                 input_shape,
+                 inputs_embeds.dtype,
+                 device=inputs_embeds.device,
+                 past_key_values_length=past_key_values_length,
+             )
+
+         if attention_mask is not None:
+             # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+             expanded_attn_mask = self._expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
+             combined_attention_mask = (
+                 expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+             )
+         return combined_attention_mask
+
+     def prepare_chunk_attention_mask(self, attention_mask, input_shape, inputs_embeds):
+
+         block_size = round(self.chunk_length / 4 * 2)
+         matrix_size = input_shape[1]
+
+         matrix = torch.ones(matrix_size, matrix_size)
+
+         num_full_blocks = round(matrix_size // block_size)
+         remainder = matrix_size % block_size
+         for i in range(num_full_blocks):
+             row_start = i * block_size
+             col_start = i * block_size
+             matrix[row_start:row_start + block_size, col_start:col_start + block_size] = torch.zeros(block_size, block_size)
+
+         if remainder > 0:
+             last_row_start = num_full_blocks * block_size
+             last_col_start = num_full_blocks * block_size
+             matrix[last_row_start:last_row_start + remainder, last_col_start:last_col_start + remainder] = torch.zeros(remainder, remainder)
+
+         matrix = matrix * -65504
+         matrix = matrix.unsqueeze(0).unsqueeze(0).repeat(input_shape[0], 1, 1, 1)
+         attention_mask = matrix.to(inputs_embeds.device)
+         return attention_mask
+
+     def forward(
+         self,
+         input_features,
+         attention_mask=None,
+         head_mask=None,
+         output_attentions=None,
+         output_hidden_states=None,
+         return_dict=None,
+     ):
+         # (N, T, C) -> (T, N, C) -> (N, C, T)
+         input_features = input_features.permute(1, 0, 2)
+         input_features = self.adapt_in(input_features)
+         input_features = input_features.permute(1, 2, 0)
+
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         # (N, C, T) -> (N, C, T//2)
+         inputs_embeds = nn.functional.gelu(self.conv1(input_features))
+         inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
+
+         # (N, C, T) -> (N, T, C)
+         inputs_embeds = inputs_embeds.permute(0, 2, 1)  # torch.Size([1, 100, 768])
+         embed_pos = self.embed_positions.weight  # torch.Size([1500, 768])
+
+         if inputs_embeds.shape[1] > embed_pos.shape[0]:
+             target_len = inputs_embeds.shape[1]
+             padding = [0, 0, 0, target_len - embed_pos.shape[0]]
+
+             embed_pos = nn.functional.pad(embed_pos, pad=padding, mode='constant', value=0)
+             hidden_states = inputs_embeds[:, :embed_pos.shape[0], :] + embed_pos
+         else:
+             hidden_states = inputs_embeds + embed_pos[:inputs_embeds.shape[1], :]
+         hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+         encoder_states = () if output_hidden_states else None
+         all_attentions = () if output_attentions else None
+
+         input_shape = inputs_embeds.size()[:-1]
+         past_key_values_length = 0
+         attention_mask = None
+         if self.mask_type == 'chunk':
+             attention_mask = self.prepare_chunk_attention_mask(attention_mask, input_shape, inputs_embeds)
+         else:
+             attention_mask = self._prepare_decoder_attention_mask(
+                 attention_mask, input_shape, inputs_embeds, past_key_values_length
+             )
+
+         if head_mask is not None:
+             assert head_mask.size()[0] == (
+                 len(self.layers)
+             ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+
+         for idx, encoder_layer in enumerate(self.layers):
+             if output_hidden_states:
+                 encoder_states = encoder_states + (self.layer_norm(hidden_states),)
+             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+             to_drop = False
+             if self.training:
+                 dropout_probability = torch.rand([])
+                 if dropout_probability < self.layerdrop:  # skip the layer
+                     to_drop = True
+
+             if to_drop:
+                 layer_outputs = (None, None)
+             else:
+                 if self.gradient_checkpointing and self.training:
+
+                     def create_custom_forward(module):
+                         def custom_forward(*inputs):
+                             return module(*inputs, output_attentions)
+
+                         return custom_forward
+
+                     layer_outputs = torch.utils.checkpoint.checkpoint(
+                         create_custom_forward(encoder_layer),
+                         hidden_states,
+                         attention_mask,
+                         (head_mask[idx] if head_mask is not None else None),
+                     )
+                 else:
+                     layer_outputs = encoder_layer(
+                         hidden_states,
+                         attention_mask,
+                         layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                         output_attentions=output_attentions,
+                     )
+
+                 hidden_states = layer_outputs[0]
+
+             if output_attentions:
+                 all_attentions = all_attentions + (layer_outputs[1],)
+
+         # (N, T, C) -> (T, N, C)
+         hidden_states = hidden_states.permute(1, 0, 2)
+         hidden_states = self.layer_norm(hidden_states)
+         hidden_states = self.adapt_out(hidden_states)
+
+         # (T, N, C) -> (N, T, C)
+         hidden_states = hidden_states.permute(1, 0, 2)
+         if output_hidden_states:
+             encoder_states = encoder_states + (hidden_states,)
+
+         if not return_dict:
+             return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+         return ModelOutput(
+             last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+         )
+
+
+ class InternS1ProTimeSeriesConcatSubsampling(nn.Module):
+     def __init__(self, in_channels: int, concat_size: int):
+         super().__init__()
+         self.in_channels = in_channels
+         self.out_channels = in_channels * concat_size
+
+     def forward(self, ts_signals: torch.Tensor, ts_lens: torch.Tensor):
+         if ts_signals.shape[1] % 2 != 0:
+             ts_signals = ts_signals[:, :-1, :]
+         even_frames = ts_signals[:, ::2, :]
+         odd_frames = ts_signals[:, 1::2, :]
+         ts_signals = torch.cat((even_frames, odd_frames), dim=2)
+         ts_lens = ts_lens // 2
+         return ts_signals, ts_lens
+
+
+ class InternS1ProTimeSeriesFixPositionalEncoding(nn.Module):
+     def __init__(self, d_model, max_len=20000):
+         super().__init__()
+         pe = torch.zeros(max_len, d_model, dtype=torch.float)
+         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+         div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) * (-math.log(10000.0) / d_model))
+         pe[:, 0::2] = torch.sin(position * div_term)
+         pe[:, 1::2] = torch.cos(position * div_term)
+         pe = pe.unsqueeze(0).transpose(0, 1).to(torch.float32)  # (max_len, 1, d_model)
+         self.register_buffer('pe', pe, persistent=True)
+
+     def forward(self, x):
+         # x: (seq_len, batch_size, d_model)
+         x = x + self.pe[:x.size(0), :]
+         return x.clone()
+
+
+ class InternS1ProTimeSeriesMultiChannelAdaptiveSubsampling(nn.Module):
+     def __init__(self, hidden_dim=128, nhead=8, num_encoder_layers=1):
+         super().__init__()
+         self.conv = nn.Conv1d(in_channels=1, out_channels=hidden_dim, kernel_size=5, stride=1, padding=2)
+         encoder_layers = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=nhead)
+         self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_encoder_layers)
+         self.pos_encoder = InternS1ProTimeSeriesFixPositionalEncoding(d_model=hidden_dim)
+         self.subsampling = InternS1ProTimeSeriesConcatSubsampling(128, 2)
+
+     def forward(self, inputs, input_lens, sr):
+         features, feature_lens = self.forward_patch(inputs, input_lens, sr)
+         outputs = features
+         output_lens = feature_lens
+         return outputs, output_lens
+
+     def forward_patch(self, inputs, input_lens, sr):
+         sr = sr.float()
+         strides = torch.floor(160/((1+torch.exp(-sr/100))**6))
+         patch_sizes = strides * 2
+         patched_outputs = []
+         output_lens = []
+
+         for i in range(len(inputs)):
+             seq = inputs[i]  # [seq_len, num_channel]
+             ps = patch_sizes[i].item()
+             st = strides[i].item()
+             le = input_lens[i]
+
+             output_len = torch.ceil((le - ps) / st) + 1
+             pad_len = ((output_len - 1) * st + ps - le).long().item()
+             if seq.ndim == 1:
+                 seq = seq.unsqueeze(-1)
+             seq = nn.functional.pad(seq, (0, 0, 0, pad_len), "constant", 0)
+             assert output_len > 0, (seq.shape, ps, st, le, output_len)
+             output_lens.append(output_len)
+             indices = (torch.arange(0, output_len * st, st).unsqueeze(1) + torch.arange(ps)).long()
+             patched = seq[indices]
+
+             output = self.forward_encoder(patched)  # [num_patch, D]
+             patched_outputs.append(output)
+
+         outputs = nn.utils.rnn.pad_sequence(patched_outputs, batch_first=True)
+         output_lens = torch.tensor(output_lens).squeeze().to(outputs.device).long()
+         if output_lens.ndim == 0:
+             output_lens = output_lens.unsqueeze(0)
+
+         outputs, output_lens = self.subsampling(outputs.clone(), output_lens.clone())
+         return outputs, output_lens
+
+     def forward_encoder(self, x):
+         num_patch, patch_len, C = x.shape
+         # conv1
+         x = x.reshape(num_patch*C, 1, patch_len)  # treat each channel as an independent sample for conv1
+         x = nn.functional.relu((self.conv(x)))  # [B*C, D1, L]
+         x = x.permute(2, 0, 1)  # [L, B*C, D1]
+
+         x = self.pos_encoder(x)  # [L, B*C, D1]
+         x = self.transformer_encoder(x.to(torch.bfloat16))
+         x = x.mean(0)
+
+         x = x.reshape(num_patch, C, -1)
+
+         return x.mean(1)
+
+ class InternS1ProTimeSeriesProjector(nn.Module):
+     def __init__(self, config: InternS1ProTimeSeriesConfig):
+         super().__init__()
+         self.layer_norm = nn.LayerNorm(config.ts_hidden_dim)
+         self.linear_1 = nn.Linear(config.ts_hidden_dim, config.out_hidden_size)
+         self.act = ACT2FN[config.activation_function]
+         self.linear_2 = nn.Linear(config.out_hidden_size, config.out_hidden_size)
+
+     def forward(self, ts_features):
+         hidden_states = self.layer_norm(ts_features)
+         hidden_states = self.linear_1(hidden_states)
+         hidden_states = self.act(hidden_states)
+         hidden_states = self.linear_2(hidden_states)
+         return hidden_states
+
+ class InternS1ProTimeSeriesModel(InternS1ProPreTrainedModel):
+     main_input_name = 'time_series_signals'
+     _supports_flash_attn_2 = False
+     config_class = InternS1ProTimeSeriesConfig
+     _no_split_modules = ['WhisperEncoderLayer']
+
+     def __init__(self, config: InternS1ProTimeSeriesConfig):
+         super().__init__(config)
+         self.config = config
+         self.encoder_embed = InternS1ProTimeSeriesMultiChannelAdaptiveSubsampling()
+         self.encoder = InternS1ProTimeSeriesEncoder(config)
+         self.projector = InternS1ProTimeSeriesProjector(config)
+
+     def get_input_embeddings(self):
+         return self.encoder_embed
+
+     def make_pad_mask(self, lengths: torch.Tensor) -> torch.Tensor:
+         """
+         Args:
+           lengths:
+             A 1-D tensor containing sentence lengths.
+           max_len:
+             The length of masks.
+         Returns:
+           Return a 2-D bool tensor, where masked positions
+           are filled with `True` and non-masked positions are
+           filled with `False`.
+
+         >>> lengths = torch.tensor([1, 3, 2, 5])
+         >>> make_pad_mask(lengths)
+         tensor([[False, True, True, True, True],
+                 [False, False, False, True, True],
+                 [False, False, True, True, True],
+                 [False, False, False, False, False]])
+         """
+         assert lengths.ndim == 1, lengths.ndim
+         max_len = lengths.max()
+         n = lengths.size(0)
+         seq_range = torch.arange(0, max_len, device=lengths.device)
+         expaned_lengths = seq_range.unsqueeze(0).expand(n, max_len)
+         return expaned_lengths >= lengths.unsqueeze(-1)
+
+     def forward(
+         self,
+         time_series_signals: Optional[torch.FloatTensor] = None,
+         ts_lens: Optional[torch.Tensor] = None,
+         sr: Optional[torch.Tensor] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         time_series_embeds: Optional[torch.FloatTensor] = None,
+     ):
+
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         if time_series_signals is None and time_series_embeds is None:
+             raise ValueError('You have to specify time_series_signals or time_series_embeds')
+
+         if time_series_embeds is not None and len(time_series_embeds.shape) == 3 and time_series_embeds.shape[-1] == self.config.ts_adapt_in_dim:
+             time_series_embeds = time_series_embeds
+         else:
+             if (isinstance(time_series_signals, list) and len(time_series_signals[0].shape) == 2) \
+                 or (isinstance(time_series_signals, torch.Tensor) and len(time_series_signals.shape) == 3):
+                 time_series_embeds, ts_lens = self.encoder_embed(time_series_signals, ts_lens, sr)
+             else:
+                 raise ValueError(f'wrong time_series_signals size: {time_series_signals[0].shape}')
+
+         # [B, 64000, 1] -> [B, 200, 256] -> [B, 100, 1024]
+         encoder_outputs = self.encoder(
+             input_features=time_series_embeds,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         # ts_lens after encoder
+         ts_lens = (ts_lens + 1) // 2
+         assert torch.all(ts_lens > 0), f"The length of time_series_embeds is too small. ts_lens: {ts_lens}"
+
+         src_key_padding_mask = self.make_pad_mask(ts_lens)
+         last_hidden_state = encoder_outputs.last_hidden_state
+
+         ts_pad_mask = src_key_padding_mask
+         ts_embeds = self.projector(last_hidden_state)
+
+         return ts_embeds, ts_pad_mask
+
+
 @dataclass
 @auto_docstring(
     custom_intro="""
@@ -1118,12 +1556,13 @@ class InternS1ProModel(InternS1ProPreTrainedModel):
     # Reference: fix gemma3 grad acc #37208
     accepts_loss_kwargs = False
     config: InternS1ProConfig
-     _no_split_modules = ["InternS1ProMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock"]
+     _no_split_modules = ["InternS1ProMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock", "WhisperEncoderLayer"]
 
     def __init__(self, config):
         super().__init__(config)
         self.visual = InternS1ProVisionModel._from_config(config.vision_config)
         self.language_model = InternS1ProTextModel._from_config(config.text_config)
+         self.time_series = InternS1ProTimeSeriesModel._from_config(config.ts_config)
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -1170,6 +1609,15 @@
         split_sizes = (image_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
         image_embeds = torch.split(image_embeds, split_sizes)
         return image_embeds
+
+     def get_ts_feature(self, ts_values, ts_lens, sr):
+         ts_embeds, ts_pad_mask = self.time_series(
+             time_series_signals=ts_values,
+             ts_lens=ts_lens,
+             sr=sr,
+             output_hidden_states=False,
+             return_dict=True)
+         return ts_embeds, ts_pad_mask
 
     def get_placeholder_mask(
         self,
@@ -1225,6 +1673,9 @@
         image_grid_thw: Optional[torch.LongTensor] = None,
         video_grid_thw: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
+         ts_values: Union[torch.FloatTensor, list[torch.FloatTensor]] = None,
+         ts_lens: Union[torch.Tensor, list[torch.Tensor]] = None,
+         ts_sr: Union[torch.FloatTensor, list[torch.FloatTensor]] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, Qwen3VLMoeModelOutputWithPast]:
         r"""
@@ -1232,6 +1683,12 @@
             The temporal, height and width of feature shape of each image in LLM.
         video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
             The temporal, height and width of feature shape of each video in LLM.
+         ts_values (`torch.FloatTensor` of shape `(batch_size, seq_len, num_channels)`, *optional*):
+             The tensors corresponding to the input time series signals.
+         ts_lens (`torch.Tensor` of shape `(batch_size,)`, *optional*):
+             The valid lengths of each time series signal in the batch.
+         ts_sr (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
+             The sampling rates of each time series signal in the batch.
         """
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
@@ -1258,6 +1715,27 @@
             )
             inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
 
+         if pixel_values is None and pixel_values_videos is None and ts_values is not None:
+             ts_features, ts_pad_mask = self.get_ts_feature(ts_values, ts_lens, ts_sr)  # [B, T, C], [B, T]
+             ts_features = ts_features[~ts_pad_mask].to(inputs_embeds.device, inputs_embeds.dtype)  # [num_valid_ts_tokens, C]
+             B, N, C = inputs_embeds.shape
+             input_ids = input_ids.reshape(B * N)
+             inputs_embeds = inputs_embeds.reshape(B * N, C)
+             # replace ts_token in inputs_embeds and attention_mask
+             ts_placeholder = (input_ids == self.config.ts_token_id)
+             n_ts_placeholders = ts_placeholder.sum().item()
+             n_ts_tokens = ts_features.size(0)
+             assert n_ts_placeholders == n_ts_tokens, f"[ERROR]: Mismatch: <TS_CONTEXT> tokens={n_ts_placeholders}, ts_embeds_valid={n_ts_tokens}"
+
+             try:
+                 inputs_embeds[ts_placeholder] = inputs_embeds[ts_placeholder] * 0.0 + ts_features
+             except Exception as e:
+                 print(f'warning: {e}, inputs_embeds[selected].shape={inputs_embeds[ts_placeholder].shape}, ts_embeds_valid.shape={ts_features.shape}')
+                 inputs_embeds[ts_placeholder] = inputs_embeds[ts_placeholder] * 0.0 + ts_features[:n_ts_placeholders]
+
+             inputs_embeds = inputs_embeds.reshape(B, N, C)
+             # input_ids = input_ids.reshape(B, N)
+
         if position_ids is None:
             batch_size, seq_length = inputs_embeds.shape[:2]
             if cache_position is not None:
@@ -1396,6 +1874,8 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, Generation
     def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
         return self.model.get_image_features(pixel_values, image_grid_thw)
 
+     def get_ts_feature(self, ts_values, ts_lens, sr):
+         return self.model.get_ts_feature(ts_values, ts_lens, sr)
     # Make modules available through conditional class for BC
     @property
     def language_model(self):
@@ -1404,6 +1884,9 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, Generation
     @property
     def visual(self):
         return self.model.visual
+
+     def time_series(self):
+         return self.model.time_series
 
     @check_model_inputs
     def forward(
@@ -1418,6 +1901,9 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, Generation
         pixel_values_videos: Optional[torch.FloatTensor] = None,
         image_grid_thw: Optional[torch.LongTensor] = None,
         video_grid_thw: Optional[torch.LongTensor] = None,
+         ts_values: Optional[Union[torch.FloatTensor, list[torch.FloatTensor]]] = None,
+         ts_lens: Optional[Union[torch.Tensor, list[torch.Tensor]]] = None,
+         ts_sr: Optional[Union[torch.FloatTensor, list[torch.FloatTensor]]] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs: Unpack[TransformersKwargs],
@@ -1484,6 +1970,9 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, Generation
             past_key_values=past_key_values,
             inputs_embeds=inputs_embeds,
             cache_position=cache_position,
+             ts_values=ts_values,
+             ts_lens=ts_lens,
+             ts_sr=ts_sr,
             **kwargs,
         )
 
@@ -1530,6 +2019,9 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, Generation
         pixel_values_videos=None,
         image_grid_thw=None,
         video_grid_thw=None,
+         ts_values=None,
+         ts_lens=None,
+         ts_sr=None,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -1546,6 +2038,9 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, Generation
             image_grid_thw=image_grid_thw,
             video_grid_thw=video_grid_thw,
             use_cache=use_cache,
+             ts_values=ts_values,
+             ts_lens=ts_lens,
+             ts_sr=ts_sr,
             **kwargs,
         )
 
@@ -1554,6 +2049,9 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, Generation
         if cache_position[0] != 0:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None
+             model_inputs["ts_values"] = None
+             model_inputs["ts_lens"] = None
+             model_inputs["ts_sr"] = None
 
         return model_inputs
 
@@ -1697,6 +2195,7 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, Generation
 
 __all__ = [
     "InternS1ProVisionModel",
+     "InternS1ProTimeSeriesModel",
     "InternS1ProForConditionalGeneration",
    "InternS1ProModel",
    "InternS1ProPreTrainedModel",
processing_interns1_pro.py CHANGED
@@ -18,7 +18,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
- from typing import Union
+ from typing import Union, Optional
 
 import numpy as np
 
@@ -28,6 +28,7 @@ from transformers.processing_utils import MultiModalData, ProcessingKwargs, Proc
 from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
 from transformers.utils import logging
 from transformers.video_utils import VideoInput
+ import os
 
 
 logger = logging.get_logger(__name__)
@@ -41,6 +42,7 @@ class InternS1ProProcessorKwargs(ProcessingKwargs, total=False):
            "return_mm_token_type_ids": False,
        },
        "videos_kwargs": {"return_metadata": True},
+        "time_series_kwargs": {},
    }
 
 
@@ -68,6 +70,7 @@ class InternS1ProProcessor(ProcessorMixin):
    def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
        self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
        self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
+        self.ts_token = "<TS_CONTEXT>" if not hasattr(tokenizer, "ts_token") else tokenizer.ts_token
        self.image_token_id = (
            tokenizer.image_token_id
            if getattr(tokenizer, "image_token_id", None)
@@ -78,6 +81,11 @@ class InternS1ProProcessor(ProcessorMixin):
            if getattr(tokenizer, "video_token_id", None)
            else tokenizer.convert_tokens_to_ids(self.video_token)
        )
+        self.ts_token_id = (
+            tokenizer.ts_token_id
+            if getattr(tokenizer, "ts_token_id", None)
+            else tokenizer.convert_tokens_to_ids(self.ts_token)
+        )
        super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
        self.vision_start_token = (
            "<|vision_start|>" if not hasattr(tokenizer, "vision_start_token") else tokenizer.vision_start_token
@@ -95,12 +103,132 @@ class InternS1ProProcessor(ProcessorMixin):
            if getattr(tokenizer, "vision_end_token_id", None)
            else tokenizer.convert_tokens_to_ids(self.vision_end_token)
        )
+        self.ts_start_token = (
+            "<|ts|>" if not hasattr(tokenizer, "ts_start_token") else tokenizer.ts_start_token
+        )
+        self.ts_end_token = (
+            "<|/ts|>" if not hasattr(tokenizer, "ts_end_token") else tokenizer.ts_end_token
+        )
+        self.ts_start_token_id = (
+            tokenizer.ts_start_token_id
+            if getattr(tokenizer, "ts_start_token_id", None)
+            else tokenizer.convert_tokens_to_ids(self.ts_start_token)
+        )
+        self.ts_end_token_id = (
+            tokenizer.ts_end_token_id
+            if getattr(tokenizer, "ts_end_token_id", None)
+            else tokenizer.convert_tokens_to_ids(self.ts_end_token)
+        )
+
+    def time_series_preprocessor(self, conversation):
+        if isinstance(conversation, (list, tuple)) and (
+            isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content")
+        ):
+            conversations = conversation
+        else:
+            conversations = [conversation]
+
+        batch_time_series = []
+        batch_time_series_metadata = []
+        for conversation in conversations:
+            for message in conversation:
+                if message['role'] != "user": continue
+                time_series_fnames = [
+                    content["data"]
+                    for content in message["content"]
+                    if content.get("type") == "time_series" and "data" in content
+                ]
+                time_series_rates = [
+                    content.get("sampling_rate", None)
+                    for content in message["content"]
+                    if content.get("type") == "time_series"
+                ]
+                for path, rate in zip(time_series_fnames, time_series_rates):
+                    batch_time_series.append(path)
+                    batch_time_series_metadata.append(rate)
+
+        return {"time_series_paths": batch_time_series if batch_time_series else None,
+                "time_series_sampling_rates": batch_time_series_metadata if batch_time_series_metadata else None}
+
+    def time_series_processor(self,
+        ts_paths: list[str],
+        sampling_rates: list[float],
+        do_normalize=True,
+        do_truncate=True,
+    ) -> BatchFeature:
+        assert len(ts_paths) == len(sampling_rates), "ts_paths and sampling_rates must have the same length"
+
+        ts_values = []
+        ts_sr = []
+        ts_lens = []
+
+        for idx, ts_path in enumerate(ts_paths):
+            sr = sampling_rates[idx]
+            ext = os.path.splitext(ts_path)[-1].lower()
+            if ext in [".wav", '.mp3', '.flac']:
+                try:
+                    import soundfile as sf
+                except ImportError:
+                    raise ImportError("Please install soundfile to process audio files.")
+                ts_input, sr = sf.read(ts_path)  # ts_input: np.ndarray, shape [T] or [T, C]
+            elif ext == ".csv":
+                pd = __import__("pandas")
+                df = pd.read_csv(ts_path, header=None)
+                ts_input = df.values  # [T, C]
+            elif ext == ".npy":
+                ts_input = np.load(ts_path)  # [T, C]
+            else:
+                raise ValueError(f"Unsupported file format: {ext}")
+
+            # ts_tensor = torch.from_numpy(ts_input).float()
+            if not isinstance(ts_input, np.ndarray):
+                ts_input = np.array(ts_input, dtype=np.float32)
+
+            if do_normalize:
+                mean = ts_input.mean(axis=0, keepdims=True)
+                std = ts_input.std(axis=0, keepdims=True)
+                ts_input = (ts_input - mean) / (std + 1e-8)
+
+            if do_truncate and len(ts_input) > 240000:
+                ts_input = ts_input[:240000]  # truncate to 240k to avoid oom
+
+            if ts_input.ndim == 1:
+                ts_input = ts_input[:, None]  # [T, C]
+
+            ts_len = ts_input.shape[0]
+
+            if sr is None or sr == 0:  # if no sr provided
+                sr = ts_len / 4
+
+            ts_values.append(ts_input)
+            ts_sr.append(sr)
+            ts_lens.append(ts_len)
+
+        ts_lens = np.array(ts_lens)
+        ts_sr = np.array(ts_sr)
+        num_ts_tokens = self._get_num_ts_tokens(sampling_rates=ts_sr,
+                                                ts_lens=ts_lens)
+        return BatchFeature(data={"ts_values": ts_values,
+                                  "ts_sr": ts_sr,
+                                  "ts_lens": ts_lens,
+                                  "num_ts_tokens": num_ts_tokens}
+                            )
+
+    def _get_num_ts_tokens(self, sampling_rates, ts_lens):
+        strides = np.floor(160/((1+np.exp(-sampling_rates/100))**6))
+        patch_sizes = strides * 2
+        embed_lengths = (np.ceil((ts_lens - patch_sizes) / strides) + 1).astype(np.int64)
+        num_ts_tokens = [(embed_length // 2 + 1) // 2 for embed_length in embed_lengths]
+        return num_ts_tokens
 
    def __call__(
        self,
        images: ImageInput = None,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        videos: VideoInput = None,
+        time_series_paths: Optional[list[str]] = None,
+        time_series_sampling_rates: Optional[list[float]] = None,
        **kwargs: Unpack[InternS1ProProcessorKwargs],
    ) -> BatchFeature:
        """
@@ -120,6 +248,7 @@ class InternS1ProProcessor(ProcessorMixin):
            videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
                tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
+            time_series_signals (`list[np.ndarray]`, `list[torch.Tensor]`):
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
@@ -216,6 +345,22 @@ class InternS1ProProcessor(ProcessorMixin):
 
                text[i] = text[i].replace("<|placeholder|>", self.video_token)
 
+        time_series_inputs = {}
+        if images is None and videos is None and time_series_paths is not None:
+            assert time_series_sampling_rates is not None, "If time_series_signals is provided, time_series_sampling_rates must also be provided."
+            assert len(time_series_paths) == len(time_series_sampling_rates), "The number of time series signals must match the number of sampling rates."
+            time_series_inputs = self.time_series_processor(ts_paths=time_series_paths, sampling_rates=time_series_sampling_rates)
+            num_ts_tokens = time_series_inputs.pop("num_ts_tokens")
+            assert len(num_ts_tokens) == len(text), "The number of time series signals must match the number of text prompts."
+            for i in range(len(text)):
+                if f"{self.ts_start_token}{self.ts_token}{self.ts_end_token}" in text[i]:
+                    ts_placeholder = self.ts_start_token + self.ts_token * num_ts_tokens[i] + self.ts_end_token
+                    text[i] = text[i].replace(
+                        f"{self.ts_start_token}{self.ts_token}{self.ts_end_token}", ts_placeholder, 1
+                    )
+                elif self.ts_token in text[i]:
+                    text[i] = text[i].replace(self.ts_token, self.ts_token * num_ts_tokens[i])
+
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
@@ -227,7 +372,7 @@ class InternS1ProProcessor(ProcessorMixin):
            mm_token_type_ids[array_ids == self.image_token_id] = 1
            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
 
-        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
+        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs, **time_series_inputs}, tensor_type=return_tensors)
 
    def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
        """
test_inference_ts.py ADDED
@@ -0,0 +1,78 @@
+ from pathlib import Path
+ import torch
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor
+
+
+ model_path = Path(__file__).parent.resolve()
+ print(f"Loading model from: {model_path}")
+
+ # Load the model config
+ config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+ print(f"Model config: {config.model_type}")
+ print(f"Architecture: {config.architectures}")
+
+ # Load the processor (tokenizer + image processor + ts processor)
+ print("\nLoading processor...")
+ processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+
+ # Load the model (bfloat16 precision with automatic device mapping)
+ print("\nLoading model...")
+ model = AutoModelForCausalLM.from_pretrained(
+     model_path,
+     dtype=torch.bfloat16,
+     device_map="auto",
+     # attn_implementation="flash_attention_2",  # time series does not support flash_attn yet; loading with this line raises an error
+     trust_remote_code=True
+ )
+
+ print(f"✓ Model loaded successfully!")
+ print(f"Model type: {type(model).__name__}")
+ print(f"Model device: {model.device}")
+
+ # ============================================================================
+ # Test 3: time-series conversation
+ # ============================================================================
+ print("\n" + "=" * 80)
+ print("Test 3: time-series conversation")
+ print("=" * 80)
+
+ messages = [
+     {
+         "role": "user",
+         "content": [
+             {"type": "time_series", "data": "./0092638_seism.npy", "sampling_rate": 100},
+             {"type": "text", "text": "Please determine whether an Earthquake event has occurred in the provided time-series data. If so, please specify the starting time point indices of the P-wave and S-wave in the event."},
+         ],
+     }
+ ]
+
+ time_series_inputs = processor.time_series_preprocessor(messages)
+ multimodal_inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt", enable_thinking=False, **time_series_inputs).to(model.device, dtype=torch.bfloat16)
+
+ print("\nGenerating time-series response...")
+ with torch.inference_mode():
+     multimodal_generated_ids = model.generate(
+         **multimodal_inputs,
+         max_new_tokens=200,
+         do_sample=False,
+         temperature=1.0,
+     )
+
+ # Extract the generated tokens (strip the input part)
+ multimodal_generated_ids_trimmed = [
+     out_ids[len(in_ids):] for in_ids, out_ids in zip(multimodal_inputs.input_ids, multimodal_generated_ids)
+ ]
+
+ # Decode to text
+ multimodal_output = processor.batch_decode(
+     multimodal_generated_ids_trimmed,
+     skip_special_tokens=True,
+     clean_up_tokenization_spaces=False
+ )
+
+ print("\n" + "-" * 80)
+ print("Time-series output:")
+ print("-" * 80)
+ print(multimodal_output[0])
+ print("-" * 80)
+ print("\n✅ Time-series functionality test complete!")